diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:13:14 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:13:14 -0700 |
commit | cdfd6b85b386b7bbf9d5a5179ef26970b6e5a4e7 (patch) | |
tree | 5e4034027b51f3ee4d2a488bb2cbb7a75c3bd0d8 /python/tests/transform_elasticsearch.py | |
parent | 78f08280edea4ff65ca613ad30005c45cc48dea6 (diff) | |
download | fatcat-cdfd6b85b386b7bbf9d5a5179ef26970b6e5a4e7.tar.gz fatcat-cdfd6b85b386b7bbf9d5a5179ef26970b6e5a4e7.zip |
fmt (black): tests/
Diffstat (limited to 'python/tests/transform_elasticsearch.py')
-rw-r--r-- | python/tests/transform_elasticsearch.py | 341 |
1 files changed, 178 insertions, 163 deletions
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index cee37867..082a4e99 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -24,13 +24,14 @@ from fatcat_tools.transforms import ( def test_basic_elasticsearch_convert(crossref_importer): - with open('tests/files/crossref-works.single.json', 'r') as f: + with open("tests/files/crossref-works.single.json", "r") as f: # not a single line raw = json.loads(f.read()) r = crossref_importer.parse_record(raw) - r.state = 'active' + r.state = "active" release_to_elasticsearch(r) + def test_rich_elasticsearch_convert(): r = ReleaseEntity( title="something", @@ -42,7 +43,7 @@ def test_rich_elasticsearch_convert(): ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"), ], ) - r.state = 'active' + r.state = "active" r.container = ContainerEntity( name="dummy journal", extra={ @@ -63,117 +64,132 @@ def test_rich_elasticsearch_convert(): "doaj": {"as_of": "2010-02-03"}, }, ) - r.files = [FileEntity( - mimetype="application/pdf", - urls=[ - FileUrl(rel="dweb", url="dat://a954329dlk/thingie"), - FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"), - FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"), - ], - extra={ - "shadows": {}, - }, - )] + r.files = [ + FileEntity( + mimetype="application/pdf", + urls=[ + FileUrl(rel="dweb", url="dat://a954329dlk/thingie"), + FileUrl( + rel="webarchive", + url="https://web.archive.org/web/20001122030405/http://example.com", + ), + FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"), + ], + extra={ + "shadows": {}, + }, + ) + ] es = release_to_elasticsearch(r) - assert es['release_year'] == r.release_year - assert es['file_count'] == 1 - assert es['fileset_count'] == 0 - assert es['webcapture_count'] == 0 - assert es['ref_count'] == 2 - assert es['ref_linked_count'] == 1 - - assert es['preservation'] == "bright" - assert es['is_oa'] is True - assert es['is_longtail_oa'] is False - assert es['is_preserved'] is True - assert es['in_web'] is True - assert es['in_dweb'] is True - assert es['in_ia'] is True - assert es['in_ia_sim'] is False - assert es['in_kbart'] is True - assert es['in_jstor'] is True + assert es["release_year"] == r.release_year + assert es["file_count"] == 1 + assert es["fileset_count"] == 0 + assert es["webcapture_count"] == 0 + assert es["ref_count"] == 2 + assert es["ref_linked_count"] == 1 + + assert es["preservation"] == "bright" + assert es["is_oa"] is True + assert es["is_longtail_oa"] is False + assert es["is_preserved"] is True + assert es["in_web"] is True + assert es["in_dweb"] is True + assert es["in_ia"] is True + assert es["in_ia_sim"] is False + assert es["in_kbart"] is True + assert es["in_jstor"] is True + def test_elasticsearch_release_from_json(): - r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity) + r = entity_from_json( + open("./tests/files/release_etodop5banbndg3faecnfm6ozi.json", "r").read(), ReleaseEntity + ) es = release_to_elasticsearch(r) - assert es['subtitle'] == "Correpondence" - assert es['ident'] == "etodop5banbndg3faecnfm6ozi" - assert es['container_name'] == "BJOG: an International Journal of Obstetrics and Gynaecology" - assert es['first_page'] == "1404" - assert es['issue'] == "11" - assert es['volume'] == "118" - assert es['number'] is None - - assert es['preservation'] == "dark" - assert es['is_oa'] is False - assert es['is_longtail_oa'] is False - assert es['is_preserved'] is True - assert es['in_web'] is False - assert es['in_dweb'] is False - assert es['in_ia'] is False - assert es['in_ia_sim'] is True - assert es['in_kbart'] is True - assert es['in_jstor'] is False + assert es["subtitle"] == "Correpondence" + assert es["ident"] == "etodop5banbndg3faecnfm6ozi" + assert ( + es["container_name"] == "BJOG: an International Journal of Obstetrics and Gynaecology" + ) + assert es["first_page"] == "1404" + assert es["issue"] == "11" + assert es["volume"] == "118" + assert es["number"] is None + + assert es["preservation"] == "dark" + assert es["is_oa"] is False + assert es["is_longtail_oa"] is False + assert es["is_preserved"] is True + assert es["in_web"] is False + assert es["in_dweb"] is False + assert es["in_ia"] is False + assert es["in_ia_sim"] is True + assert es["in_kbart"] is True + assert es["in_jstor"] is False # this release has a fileset, and no file - r = entity_from_json(open('./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json', 'r').read(), ReleaseEntity) + r = entity_from_json( + open("./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json", "r").read(), ReleaseEntity + ) es = release_to_elasticsearch(r) - assert es['title'] == "Jakobshavn Glacier Bed Elevation" - assert es['ident'] == "3mssw2qnlnblbk7oqyv2dafgey" - assert es['file_count'] == 0 - assert es['fileset_count'] == 1 - assert es['webcapture_count'] == 0 - - assert es['preservation'] == "dark" - assert es['is_oa'] is True - assert es['is_longtail_oa'] is False - assert es['is_preserved'] is True - assert es['in_web'] is True - assert es['in_dweb'] is True - assert es['in_ia'] is False - assert es['in_ia_sim'] is False - assert es['in_kbart'] is False - assert es['in_jstor'] is False + assert es["title"] == "Jakobshavn Glacier Bed Elevation" + assert es["ident"] == "3mssw2qnlnblbk7oqyv2dafgey" + assert es["file_count"] == 0 + assert es["fileset_count"] == 1 + assert es["webcapture_count"] == 0 + + assert es["preservation"] == "dark" + assert es["is_oa"] is True + assert es["is_longtail_oa"] is False + assert es["is_preserved"] is True + assert es["in_web"] is True + assert es["in_dweb"] is True + assert es["in_ia"] is False + assert es["in_ia_sim"] is False + assert es["in_kbart"] is False + assert es["in_jstor"] is False # this release has a web capture, and no file (edited the JSON to remove file) - r = entity_from_json(open('./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json', 'r').read(), ReleaseEntity) + r = entity_from_json( + open("./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json", "r").read(), ReleaseEntity + ) es = release_to_elasticsearch(r) - assert es['title'] == "Rethinking Personal Digital Archiving, Part 1" - assert es['ident'] == "mjtqtuyhwfdr7j2c3l36uor7uy" - assert es['file_count'] == 0 - assert es['fileset_count'] == 0 - assert es['webcapture_count'] == 1 - - assert es['preservation'] == "bright" - assert es['is_oa'] is True - assert es['is_longtail_oa'] is False - assert es['is_preserved'] is True - assert es['in_web'] is True - assert es['in_dweb'] is False - assert es['in_ia'] is True - assert es['in_ia_sim'] is False - assert es['in_kbart'] is False - assert es['in_jstor'] is False + assert es["title"] == "Rethinking Personal Digital Archiving, Part 1" + assert es["ident"] == "mjtqtuyhwfdr7j2c3l36uor7uy" + assert es["file_count"] == 0 + assert es["fileset_count"] == 0 + assert es["webcapture_count"] == 1 + + assert es["preservation"] == "bright" + assert es["is_oa"] is True + assert es["is_longtail_oa"] is False + assert es["is_preserved"] is True + assert es["in_web"] is True + assert es["in_dweb"] is False + assert es["in_ia"] is True + assert es["in_ia_sim"] is False + assert es["in_kbart"] is False + assert es["in_jstor"] is False + def test_elasticsearch_container_transform(journal_metadata_importer): - with open('tests/files/journal_metadata.sample.json', 'r') as f: + with open("tests/files/journal_metadata.sample.json", "r") as f: raw1 = json.loads(f.readline()) raw2 = json.loads(f.readline()) c1 = journal_metadata_importer.parse_record(raw1) - c1.state = 'active' + c1.state = "active" c2 = journal_metadata_importer.parse_record(raw2) - c2.state = 'active' + c2.state = "active" - c1.extra['publisher_type'] = "big5" - c1.extra['discipline'] = "history" + c1.extra["publisher_type"] = "big5" + c1.extra["discipline"] = "history" es = container_to_elasticsearch(c1) - assert es['publisher'] == c1.publisher - assert es['discipline'] == c1.extra['discipline'] - assert es['publisher_type'] == c1.extra['publisher_type'] - assert es['keepers'] == [] + assert es["publisher"] == c1.publisher + assert es["discipline"] == c1.extra["discipline"] + assert es["publisher_type"] == c1.extra["publisher_type"] + assert es["keepers"] == [] stats = { "ident": "en4qj5ijrbf5djxx7p5zzpjyoq", @@ -186,71 +202,70 @@ def test_elasticsearch_container_transform(journal_metadata_importer): "dark": 1635, "none": 0, "shadows_only": 0, - "total": 11136 + "total": 11136, }, - "release_type": { - "_unknown": 9, - "article-journal": 11124, - "editorial": 2, - "letter": 1 - }, - "total": 11136 + "release_type": {"_unknown": 9, "article-journal": 11124, "editorial": 2, "letter": 1}, + "total": 11136, } es = container_to_elasticsearch(c2, stats=stats) - assert es['name'] == c2.name - assert es['publisher'] == c2.publisher - assert es['keepers'] == list(c2.extra['kbart'].keys()) == ["portico"] - assert es['any_kbart'] is True + assert es["name"] == c2.name + assert es["publisher"] == c2.publisher + assert es["keepers"] == list(c2.extra["kbart"].keys()) == ["portico"] + assert es["any_kbart"] is True def test_elasticsearch_file_transform(): - with open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r') as f: + with open("./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json", "r") as f: json_str = f.read() fe = entity_from_json(json_str, FileEntity) - fe.state = 'active' + fe.state = "active" es = file_to_elasticsearch(fe) # pylint infers type of 'fe' incorrectly for some reason (as str/bytes) - assert es['sha1'] == fe.sha1 # pylint: disable=no-member - assert es['sha256'] == fe.sha256 # pylint: disable=no-member - assert es['md5'] == fe.md5 # pylint: disable=no-member - assert es['size_bytes'] == fe.size # pylint: disable=no-member - assert es['mimetype'] == fe.mimetype # pylint: disable=no-member - assert es['in_ia'] is True - - assert 'web' in es['rels'] - assert 'www.zhros.ru' in es['hosts'] - assert 'zhros.ru' in es['domains'] - assert 'archive.org' in (es['hosts'] + es['domains']) - assert 'web.archive.org' in (es['hosts'] + es['domains']) + assert es["sha1"] == fe.sha1 # pylint: disable=no-member + assert es["sha256"] == fe.sha256 # pylint: disable=no-member + assert es["md5"] == fe.md5 # pylint: disable=no-member + assert es["size_bytes"] == fe.size # pylint: disable=no-member + assert es["mimetype"] == fe.mimetype # pylint: disable=no-member + assert es["in_ia"] is True + + assert "web" in es["rels"] + assert "www.zhros.ru" in es["hosts"] + assert "zhros.ru" in es["domains"] + assert "archive.org" in (es["hosts"] + es["domains"]) + assert "web.archive.org" in (es["hosts"] + es["domains"]) # old regression - assert '.archive.org' not in (es['hosts'] + es['domains']) + assert ".archive.org" not in (es["hosts"] + es["domains"]) + def test_elasticsearch_changelog_transform(): - ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry) + ce = entity_from_json( + open("./tests/files/changelog_3469683.json", "r").read(), ChangelogEntry + ) es = changelog_to_elasticsearch(ce) - assert es['index'] == 3469683 + assert es["index"] == 3469683 # len("2020-01-30T05:04:39") => 19 - assert es['timestamp'][:19] == "2020-01-30T05:04:39.738601Z"[:19] - assert es['editor_id'] == "scmbogxw25evtcesfcab5qaboa" - assert es['username'] == "crawl-bot" - assert es['is_bot'] is True - assert es['is_admin'] is True - assert es['agent'] == "fatcat_tools.IngestFileResultImporter" - - assert es['total'] == 50 - assert es['files'] == 50 - assert es['new_files'] == 50 - assert es['created'] == 50 - - assert es['releases'] == 0 - assert es['new_releases'] == 0 - assert es['updated'] == 0 - assert es['deleted'] == 0 + assert es["timestamp"][:19] == "2020-01-30T05:04:39.738601Z"[:19] + assert es["editor_id"] == "scmbogxw25evtcesfcab5qaboa" + assert es["username"] == "crawl-bot" + assert es["is_bot"] is True + assert es["is_admin"] is True + assert es["agent"] == "fatcat_tools.IngestFileResultImporter" + + assert es["total"] == 50 + assert es["files"] == 50 + assert es["new_files"] == 50 + assert es["created"] == 50 + + assert es["releases"] == 0 + assert es["new_releases"] == 0 + assert es["updated"] == 0 + assert es["deleted"] == 0 + def test_elasticsearch_release_kbart_year(): this_year = datetime.date.today().year @@ -264,7 +279,7 @@ def test_elasticsearch_release_kbart_year(): ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"), ], ) - r.state = 'active' + r.state = "active" r.container = ContainerEntity( name="dummy journal", extra={ @@ -276,18 +291,18 @@ def test_elasticsearch_release_kbart_year(): }, ) es = release_to_elasticsearch(r) - assert es['release_year'] == this_year - - assert es['preservation'] == "none" - assert es['is_oa'] is True - assert es['is_longtail_oa'] is False - assert es['is_preserved'] is False - assert es['in_web'] is False - assert es['in_dweb'] is False - assert es['in_ia'] is False - assert es['in_ia_sim'] is False - assert es['in_kbart'] is False - assert es['in_jstor'] is False + assert es["release_year"] == this_year + + assert es["preservation"] == "none" + assert es["is_oa"] is True + assert es["is_longtail_oa"] is False + assert es["is_preserved"] is False + assert es["in_web"] is False + assert es["in_dweb"] is False + assert es["in_ia"] is False + assert es["in_ia_sim"] is False + assert es["in_kbart"] is False + assert es["in_jstor"] is False r.container = ContainerEntity( name="dummy journal", @@ -300,15 +315,15 @@ def test_elasticsearch_release_kbart_year(): }, ) es = release_to_elasticsearch(r) - assert es['release_year'] == this_year - - assert es['preservation'] == "dark" - assert es['is_oa'] is True - assert es['is_longtail_oa'] is False - assert es['is_preserved'] is True - assert es['in_web'] is False - assert es['in_dweb'] is False - assert es['in_ia'] is False - assert es['in_ia_sim'] is False - assert es['in_kbart'] is True - assert es['in_jstor'] is False + assert es["release_year"] == this_year + + assert es["preservation"] == "dark" + assert es["is_oa"] is True + assert es["is_longtail_oa"] is False + assert es["is_preserved"] is True + assert es["in_web"] is False + assert es["in_dweb"] is False + assert es["in_ia"] is False + assert es["in_ia_sim"] is False + assert es["in_kbart"] is True + assert es["in_jstor"] is False |