From 5d458a3df7e58e6551d8ec72979e376c62fdd2f7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 29 Jan 2020 21:52:33 -0800
Subject: fix some transform bugs, add some tests

---
 python/tests/transform_elasticsearch.py | 114 ++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 python/tests/transform_elasticsearch.py

(limited to 'python/tests/transform_elasticsearch.py')

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
new file mode 100644
index 00000000..ab613a0a
--- /dev/null
+++ b/python/tests/transform_elasticsearch.py
@@ -0,0 +1,114 @@
+
+import json
+import pytest
+from fatcat_tools import *
+from fatcat_openapi_client import *
+from fixtures import api
+from import_journal_metadata import journal_metadata_importer
+
+from import_crossref import crossref_importer
+from import_matched import matched_importer
+
+def test_basic_elasticsearch_convert(crossref_importer):
+    with open('tests/files/crossref-works.single.json', 'r') as f:
+        # not a single line
+        raw = json.loads(f.read())
+        r = crossref_importer.parse_record(raw)
+    r.state = 'active'
+    release_to_elasticsearch(r)
+
+def test_rich_elasticsearch_convert():
+    r = ReleaseEntity(
+        title="something",
+        release_year=1234,
+        license_slug="CC-BY-NC",
+        ext_ids=ReleaseExtIds(),
+        refs=[
+            ReleaseRef(),
+            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
+        ],
+    )
+    r.state = 'active'
+    r.container = ContainerEntity(
+        name="dummy journal",
+        extra={
+            "ia": {
+                "sim": {
+                    "year_spans": [[1000, 1100]],
+                },
+            },
+            "kbart": {
+                "lockss": {
+                    "year_spans": [[1200, 1300]],
+                },
+                "jstor": {
+                    "year_spans": [[1950, 1960], [1980, 2005]],
+                },
+            },
+            "sherpa_romeo": {"color": "blue"},
+            "doaj": {"as_of": "2010-02-03"},
+        },
+    )
+    r.files = [FileEntity(
+        mimetype="application/pdf",
+        urls=[
+            FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
+            FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"),
+            FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
+        ],
+        extra={
+            "shadows": {},
+        },
+    )]
+    es = release_to_elasticsearch(r)
+    assert es['release_year'] == r.release_year
+    assert es['in_ia'] == True
+    assert es['in_jstor'] == False
+    assert es['in_ia_sim'] == False
+    assert es['in_ia'] == True
+    assert es['in_web'] == True
+    assert es['in_dweb'] == True
+    assert es['is_oa'] == True
+    assert es['is_longtail_oa'] == False
+    assert es['ref_count'] == 2
+    assert es['ref_linked_count'] == 1
+
+def test_elasticsearch_release_from_json():
+    r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity)
+    es = release_to_elasticsearch(r)
+
+    assert es['subtitle'] == "Correpondence"
+    assert es['ident'] == "etodop5banbndg3faecnfm6ozi"
+    assert es['container_name'] == "BJOG: an International Journal of Obstetrics and Gynaecology"
+    assert es['first_page'] == "1404"
+    assert es['issue'] == "11"
+    assert es['volume'] == "118"
+    assert es['number'] == None
+    assert es['in_ia_sim'] == True
+    assert es['in_kbart'] == True
+
+def test_elasticsearch_container_transform(journal_metadata_importer):
+    with open('tests/files/journal_metadata.sample.json', 'r') as f:
+        raw = json.loads(f.readline())
+    c = journal_metadata_importer.parse_record(raw)
+    c.state = 'active'
+    es = container_to_elasticsearch(c)
+    assert es['publisher'] == c.publisher
+
+def test_elasticsearch_file_transform(matched_importer):
+    f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity)
+
+    f.state = 'active'
+    es = file_to_elasticsearch(f)
+    assert es['sha1'] == f.sha1
+    assert es['sha256'] == f.sha256
+    assert es['md5'] == f.md5
+    assert es['size_bytes'] == f.size
+    assert es['mimetype'] == f.mimetype
+    assert es['in_ia'] == True
+    assert 'publisher' in es['rel']
+
+    # XXX: implement hosts and domain parsing with urlcanon
+    #assert 'journals.plos.org' in es['host']
+    #assert 'plos.org' in es['domain']
+
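
The in_ia_sim and in_jstor assertions in test_rich_elasticsearch_convert hinge on whether the release year falls inside any of the container's coverage year_spans. A minimal sketch of that containment check, as a standalone illustration rather than the actual fatcat implementation (the function name year_in_spans is hypothetical):

    def year_in_spans(year, spans):
        # spans is a list of [first_year, last_year] pairs, inclusive on both ends
        return any(first <= year <= last for (first, last) in spans)

    # For the dummy container above, release_year=1234 is outside the "ia.sim"
    # span [1000, 1100] and outside both "jstor" spans, which is why the test
    # expects in_ia_sim == False and in_jstor == False.
    assert year_in_spans(1234, [[1000, 1100]]) == False
    assert year_in_spans(1234, [[1950, 1960], [1980, 2005]]) == False
    assert year_in_spans(1234, [[1200, 1300]]) == True  # the "kbart.lockss" span
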
From d58c3891ac2122dac53ced606568108f543f2d80 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 29 Jan 2020 21:52:58 -0800
Subject: actually implement changelog transform

---
 extra/elasticsearch/changelog_schema.json       | 11 ++++-
 python/fatcat_tools/transforms/elasticsearch.py | 62 ++++++++++++++++++-------
 python/tests/transform_elasticsearch.py         | 24 +++++++++-
 3 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
index f3211e99..77c77238 100644
--- a/extra/elasticsearch/changelog_schema.json
+++ b/extra/elasticsearch/changelog_schema.json
@@ -16,20 +16,29 @@
     "changelog": {
         "properties": {
             "index": { "type": "integer" },
-            "editgroup_id": { "type": "keyword" },
+            "editgroup_id": { "type": "keyword", "doc_values": false },
             "timestamp": { "type": "date" },
             "editor_id": { "type": "keyword" },
             "username": { "type": "keyword" },
             "is_bot": { "type": "boolean" },
             "is_admin": { "type": "boolean" },
             "agent": { "type": "keyword" },
+            "containers": { "type": "integer" },
+            "new_containers": { "type": "integer" },
             "creators": { "type": "integer" },
+            "new_creators": { "type": "integer" },
             "files": { "type": "integer" },
+            "new_files": { "type": "integer" },
             "filessets": { "type": "integer" },
+            "new_filessets": { "type": "integer" },
             "webcaptures": { "type": "integer" },
+            "new_webcaptures": { "type": "integer" },
             "releases": { "type": "integer" },
+            "new_releases": { "type": "integer" },
             "works": { "type": "integer" },
+            "new_works": { "type": "integer" },
+
             "created": { "type": "integer" },
             "updated": { "type": "integer" },
             "deleted": { "type": "integer" },

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 812cd1fd..c8547b27 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -401,36 +401,64 @@ def container_to_elasticsearch(entity, force_bool=True):
 
     return t
 
+
+def _type_of_edit(edit):
+    if edit.revision == None and edit.redirect_ident == None:
+        return 'delete'
+    elif edit.redirect_ident:
+        # redirect
+        return 'update'
+    elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision:
+        return 'create'
+    else:
+        return 'update'
+
+
 def changelog_to_elasticsearch(entity):
 
     editgroup = entity.editgroup
     t = dict(
         index=entity.index,
         editgroup_id=entity.editgroup_id,
-        timestamp=entity.timestamp,
+        timestamp=entity.timestamp.isoformat(),
         editor_id=editgroup.editor_id,
+        username=editgroup.editor.username,
+        is_bot=editgroup.editor.is_bot,
+        is_admin=editgroup.editor.is_admin,
     )
 
     extra = editgroup.extra or dict()
     if extra.get('agent'):
         t['agent'] = extra['agent']
 
-    t['containers'] = len(editgroup.edits.containers)
-    t['creators'] = len(editgroup.edits.containers)
-    t['files'] = len(editgroup.edits.containers)
-    t['filesets'] = len(editgroup.edits.containers)
-    t['webcaptures'] = len(editgroup.edits.containers)
-    t['releases'] = len(editgroup.edits.containers)
-    t['works'] = len(editgroup.edits.containers)
-
-    # TODO: parse and pull out counts
-    #created = 0
-    #updated = 0
-    #deleted = 0
-    #t['created'] = created
-    #t['updated'] = updated
-    #t['deleted'] = deleted
-    #t['total'] = created + updated + deleted
+    containers = [_type_of_edit(e) for e in editgroup.edits.containers]
+    creators = [_type_of_edit(e) for e in editgroup.edits.creators]
+    files = [_type_of_edit(e) for e in editgroup.edits.files]
+    filesets = [_type_of_edit(e) for e in editgroup.edits.filesets]
+    webcaptures = [_type_of_edit(e) for e in editgroup.edits.webcaptures]
+    releases = [_type_of_edit(e) for e in editgroup.edits.releases]
+    works = [_type_of_edit(e) for e in editgroup.edits.works]
+
+    t['containers'] = len(containers)
+    t['new_containers'] = len([e for e in containers if e == 'create'])
+    t['creators'] = len(creators)
+    t['new_creators'] = len([e for e in creators if e == 'create'])
+    t['files'] = len(files)
+    t['new_files'] = len([e for e in files if e == 'create'])
+    t['filesets'] = len(filesets)
+    t['new_filesets'] = len([e for e in filesets if e == 'create'])
+    t['webcaptures'] = len(webcaptures)
+    t['new_webcaptures'] = len([e for e in webcaptures if e == 'create'])
+    t['releases'] = len(releases)
+    t['new_releases'] = len([e for e in releases if e == 'create'])
+    t['works'] = len(works)
+    t['new_works'] = len([e for e in works if e == 'create'])
+
+    all_edits = containers + creators + files + filesets + webcaptures + releases + works
+
+    t['created'] = len([e for e in all_edits if e == 'create'])
+    t['updated'] = len([e for e in all_edits if e == 'update'])
+    t['deleted'] = len([e for e in all_edits if e == 'delete'])
+    t['total'] = len(all_edits)
     return t

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index ab613a0a..89a4eef8 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,9 +106,31 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'publisher' in es['rel']
+    assert 'web' in es['rel']
 
     # XXX: implement hosts and domain parsing with urlcanon
     #assert 'journals.plos.org' in es['host']
     #assert 'plos.org' in es['domain']
 
+def test_elasticsearch_changelog_transform(matched_importer):
+    ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
+
+    es = changelog_to_elasticsearch(ce)
+    assert es['index'] == 3469683
+    # len("2020-01-30T05:04:39") => 19
+    assert es['timestamp'][:19] == "2020-01-30T05:04:39.738601Z"[:19]
+    assert es['editor_id'] == "scmbogxw25evtcesfcab5qaboa"
+    assert es['username'] == "crawl-bot"
+    assert es['is_bot'] == True
+    assert es['is_admin'] == True
+    assert es['agent'] == "fatcat_tools.IngestFileResultImporter"
+
+    assert es['total'] == 50
+    assert es['files'] == 50
+    assert es['new_files'] == 50
+    assert es['created'] == 50
+
+    assert es['releases'] == 0
+    assert es['new_releases'] == 0
+    assert es['updated'] == 0
+    assert es['deleted'] == 0
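
Two details in this patch are worth unpacking. In the schema, "doc_values": false on editgroup_id disables Elasticsearch's on-disk columnar doc-values for that field, which saves space for an identifier that is filtered on but never sorted or aggregated. In the transform, _type_of_edit() classifies each edit by which of revision, prev_revision, and redirect_ident are set. Here is the function from the patch exercised against a minimal stand-in object (EditStub is a hypothetical fixture; the real inputs are EntityEdit objects from fatcat_openapi_client):

    from collections import namedtuple

    # hypothetical stand-in for fatcat_openapi_client's EntityEdit
    EditStub = namedtuple('EditStub', ['revision', 'prev_revision', 'redirect_ident'])

    def _type_of_edit(edit):
        if edit.revision == None and edit.redirect_ident == None:
            return 'delete'
        elif edit.redirect_ident:
            # redirect
            return 'update'
        elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision:
            return 'create'
        else:
            return 'update'

    assert _type_of_edit(EditStub('rev-b', None, None)) == 'create'       # first revision
    assert _type_of_edit(EditStub('rev-b', 'rev-a', None)) == 'update'    # revised entity
    assert _type_of_edit(EditStub(None, None, 'some-ident')) == 'update'  # redirect
    assert _type_of_edit(EditStub(None, 'rev-a', None)) == 'delete'       # tombstone
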
From ade1eb9ff955ca5ba58acdc8b76e344c9cc54790 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 29 Jan 2020 23:56:27 -0800
Subject: fix ES file schema plural field names

---
 python/fatcat_tools/transforms/elasticsearch.py | 7 +++----
 python/tests/transform_elasticsearch.py         | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 42669bbf..5a492fb4 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -497,13 +497,12 @@ def file_to_elasticsearch(entity):
         sha1 = entity.sha1,
         sha256 = entity.sha256,
         md5 = entity.md5,
-        rel = [u.rel for u in entity.urls],
     )
 
     # TODO: domain, hosts (from urls; use proper urlcanon)
-    t['rel'] = list(set([u.rel for u in entity.urls]))
-    t['host'] = []
-    t['domain'] = []
+    t['rels'] = list(set([u.rel for u in entity.urls]))
+    t['hosts'] = []
+    t['domains'] = []
 
     in_ia = False
     for u in entity.urls:

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index 89a4eef8..c247e745 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,7 +106,7 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'web' in es['rel']
+    assert 'web' in es['rels']
 
     # XXX: implement hosts and domain parsing with urlcanon
     #assert 'journals.plos.org' in es['host']
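
Besides the rename, this patch drops the redundant rel= kwarg: the field was being computed twice, once as a plain list in the dict() constructor and once deduplicated below. The surviving list(set(...)) form collapses duplicates and discards ordering; a small standalone illustration (plain Python; the sample data is invented for the example):

    # urls mimics entity.urls as (rel, url) pairs
    urls = [('web', 'http://a.example/x.pdf'),
            ('web', 'http://b.example/x.pdf'),
            ('webarchive', 'https://web.archive.org/web/1/http://a.example/x.pdf')]

    rels = list(set(rel for rel, _ in urls))
    assert sorted(rels) == ['web', 'webarchive']  # duplicates collapsed; order not guaranteed
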
From 4cbee44529dd967c966ed3f2cc2bb80176be4e43 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 30 Jan 2020 00:08:41 -0800
Subject: implement host+domain parsing for file ES transform

---
 python/fatcat_tools/transforms/elasticsearch.py | 14 +++++---------
 python/tests/transform_elasticsearch.py         |  7 +++----
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
 import collections
+import tldextract
 
 from fatcat_openapi_client import ApiClient
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
         md5 = entity.md5,
     )
 
-    # TODO: domain, hosts (from urls; use proper urlcanon)
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
-    t['hosts'] = []
-    t['domains'] = []
 
-    in_ia = False
-    for u in entity.urls:
-        if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
-            in_ia = True
-    t['in_ia'] = bool(in_ia)
+    t['in_ia'] = bool('archive.org' in t['domains'])
 
     return t

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index c247e745..e67681c6 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'web' in es['rels']
 
-    # XXX: implement hosts and domain parsing with urlcanon
-    #assert 'journals.plos.org' in es['host']
-    #assert 'plos.org' in es['domain']
+    assert 'web' in es['rels']
+    assert 'www.zhros.ru' in es['hosts']
+    assert 'zhros.ru' in es['domains']
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
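
A caveat on the '.'.join(pu) introduced here: with the tldextract releases current at the time (where extract() returns a (subdomain, domain, suffix) namedtuple), the subdomain segment is an empty string for bare domains, so the plain join produces a leading dot. A sketch of the behavior, assuming tldextract is installed:

    import tldextract

    pu = tldextract.extract('https://archive.org/details/blah/file.pdf')
    # pu == ExtractResult(subdomain='', domain='archive', suffix='org')
    assert '.'.join(pu) == '.archive.org'                       # leading-dot artifact
    assert '.'.join(seg for seg in pu if seg) == 'archive.org'  # skip empty segments

The empty-segment filter is exactly the fix that lands in the next commit, and the '.archive.org' string is the "old regression" its updated test asserts against.
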
From 741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 31 Jan 2020 13:31:59 -0800
Subject: ES releases: host/domain fixes

---
 python/fatcat_tools/transforms/elasticsearch.py | 4 ++--
 python/tests/transform_elasticsearch.py         | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index b5abe2ae..f8bc05fb 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -502,7 +502,7 @@ def file_to_elasticsearch(entity):
     )
 
     parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
-    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
     t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
 
@@ -512,6 +512,6 @@ def file_to_elasticsearch(entity):
     # ok, but actually remove archive.org hosts, because they make other
     # aggregations hard and are a waste of storage
     t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
-    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
+    t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
 
     return t

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index e67681c6..c94ab375 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -110,6 +110,9 @@ def test_elasticsearch_file_transform(matched_importer):
     assert 'web' in es['rels']
     assert 'www.zhros.ru' in es['hosts']
     assert 'zhros.ru' in es['domains']
+    assert not '.archive.org' in (es['hosts'] + es['domains'])
+    assert not 'archive.org' in (es['hosts'] + es['domains'])
+    assert not 'web.archive.org' in (es['hosts'] + es['domains'])
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)

From ed38bfde4e1eaddd6d710802b6f372c7b0aab26b Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 14 Feb 2020 00:07:56 -0800
Subject: ES updates: fix tests to accept archive.org in host/domain

---
 python/tests/transform_elasticsearch.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index c94ab375..a954fc4d 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -110,9 +110,10 @@ def test_elasticsearch_file_transform(matched_importer):
     assert 'web' in es['rels']
     assert 'www.zhros.ru' in es['hosts']
     assert 'zhros.ru' in es['domains']
+    assert 'archive.org' in (es['hosts'] + es['domains'])
+    assert 'web.archive.org' in (es['hosts'] + es['domains'])
+    # old regression
     assert not '.archive.org' in (es['hosts'] + es['domains'])
-    assert not 'archive.org' in (es['hosts'] + es['domains'])
-    assert not 'web.archive.org' in (es['hosts'] + es['domains'])
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
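
Taken together, the series leaves the file transform deriving rels, hosts, and domains from the URL list via tldextract. Note that the last commit above only updates the tests: the code change that lets archive.org back into hosts/domains is not visible in this path-limited log. A condensed, self-contained sketch of the URL summarization under those assumptions (Url is a hypothetical stand-in for fatcat's FileUrl; tldextract's namedtuple-era API is assumed):

    from collections import namedtuple
    import tldextract

    Url = namedtuple('Url', ['rel', 'url'])  # stand-in for fatcat's FileUrl

    def summarize_urls(urls):
        # parse each URL into (subdomain, domain, suffix) segments
        parsed = [tldextract.extract(u.url) for u in urls]
        return {
            'rels': list(set(u.rel for u in urls)),
            'hosts': list(set('.'.join(seg for seg in pu if seg) for pu in parsed)),
            'domains': list(set(pu.registered_domain for pu in parsed)),
        }

    es = summarize_urls([
        Url('web', 'http://www.zhros.ru/article.pdf'),
        Url('webarchive', 'https://web.archive.org/web/2020/http://www.zhros.ru/article.pdf'),
    ])
    assert 'www.zhros.ru' in es['hosts']
    assert 'zhros.ru' in es['domains']
    assert 'web.archive.org' in es['hosts']
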