aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests/transform_elasticsearch.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests/transform_elasticsearch.py')
-rw-r--r--python/tests/transform_elasticsearch.py341
1 files changed, 178 insertions, 163 deletions
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index cee37867..082a4e99 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -24,13 +24,14 @@ from fatcat_tools.transforms import (
def test_basic_elasticsearch_convert(crossref_importer):
- with open('tests/files/crossref-works.single.json', 'r') as f:
+ with open("tests/files/crossref-works.single.json", "r") as f:
# not a single line
raw = json.loads(f.read())
r = crossref_importer.parse_record(raw)
- r.state = 'active'
+ r.state = "active"
release_to_elasticsearch(r)
+
def test_rich_elasticsearch_convert():
r = ReleaseEntity(
title="something",
@@ -42,7 +43,7 @@ def test_rich_elasticsearch_convert():
ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
],
)
- r.state = 'active'
+ r.state = "active"
r.container = ContainerEntity(
name="dummy journal",
extra={
@@ -63,117 +64,132 @@ def test_rich_elasticsearch_convert():
"doaj": {"as_of": "2010-02-03"},
},
)
- r.files = [FileEntity(
- mimetype="application/pdf",
- urls=[
- FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
- FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"),
- FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
- ],
- extra={
- "shadows": {},
- },
- )]
+ r.files = [
+ FileEntity(
+ mimetype="application/pdf",
+ urls=[
+ FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
+ FileUrl(
+ rel="webarchive",
+ url="https://web.archive.org/web/20001122030405/http://example.com",
+ ),
+ FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
+ ],
+ extra={
+ "shadows": {},
+ },
+ )
+ ]
es = release_to_elasticsearch(r)
- assert es['release_year'] == r.release_year
- assert es['file_count'] == 1
- assert es['fileset_count'] == 0
- assert es['webcapture_count'] == 0
- assert es['ref_count'] == 2
- assert es['ref_linked_count'] == 1
-
- assert es['preservation'] == "bright"
- assert es['is_oa'] is True
- assert es['is_longtail_oa'] is False
- assert es['is_preserved'] is True
- assert es['in_web'] is True
- assert es['in_dweb'] is True
- assert es['in_ia'] is True
- assert es['in_ia_sim'] is False
- assert es['in_kbart'] is True
- assert es['in_jstor'] is True
+ assert es["release_year"] == r.release_year
+ assert es["file_count"] == 1
+ assert es["fileset_count"] == 0
+ assert es["webcapture_count"] == 0
+ assert es["ref_count"] == 2
+ assert es["ref_linked_count"] == 1
+
+ assert es["preservation"] == "bright"
+ assert es["is_oa"] is True
+ assert es["is_longtail_oa"] is False
+ assert es["is_preserved"] is True
+ assert es["in_web"] is True
+ assert es["in_dweb"] is True
+ assert es["in_ia"] is True
+ assert es["in_ia_sim"] is False
+ assert es["in_kbart"] is True
+ assert es["in_jstor"] is True
+
def test_elasticsearch_release_from_json():
- r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity)
+ r = entity_from_json(
+ open("./tests/files/release_etodop5banbndg3faecnfm6ozi.json", "r").read(), ReleaseEntity
+ )
es = release_to_elasticsearch(r)
- assert es['subtitle'] == "Correpondence"
- assert es['ident'] == "etodop5banbndg3faecnfm6ozi"
- assert es['container_name'] == "BJOG: an International Journal of Obstetrics and Gynaecology"
- assert es['first_page'] == "1404"
- assert es['issue'] == "11"
- assert es['volume'] == "118"
- assert es['number'] is None
-
- assert es['preservation'] == "dark"
- assert es['is_oa'] is False
- assert es['is_longtail_oa'] is False
- assert es['is_preserved'] is True
- assert es['in_web'] is False
- assert es['in_dweb'] is False
- assert es['in_ia'] is False
- assert es['in_ia_sim'] is True
- assert es['in_kbart'] is True
- assert es['in_jstor'] is False
+ assert es["subtitle"] == "Correpondence"
+ assert es["ident"] == "etodop5banbndg3faecnfm6ozi"
+ assert (
+ es["container_name"] == "BJOG: an International Journal of Obstetrics and Gynaecology"
+ )
+ assert es["first_page"] == "1404"
+ assert es["issue"] == "11"
+ assert es["volume"] == "118"
+ assert es["number"] is None
+
+ assert es["preservation"] == "dark"
+ assert es["is_oa"] is False
+ assert es["is_longtail_oa"] is False
+ assert es["is_preserved"] is True
+ assert es["in_web"] is False
+ assert es["in_dweb"] is False
+ assert es["in_ia"] is False
+ assert es["in_ia_sim"] is True
+ assert es["in_kbart"] is True
+ assert es["in_jstor"] is False
# this release has a fileset, and no file
- r = entity_from_json(open('./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json', 'r').read(), ReleaseEntity)
+ r = entity_from_json(
+ open("./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json", "r").read(), ReleaseEntity
+ )
es = release_to_elasticsearch(r)
- assert es['title'] == "Jakobshavn Glacier Bed Elevation"
- assert es['ident'] == "3mssw2qnlnblbk7oqyv2dafgey"
- assert es['file_count'] == 0
- assert es['fileset_count'] == 1
- assert es['webcapture_count'] == 0
-
- assert es['preservation'] == "dark"
- assert es['is_oa'] is True
- assert es['is_longtail_oa'] is False
- assert es['is_preserved'] is True
- assert es['in_web'] is True
- assert es['in_dweb'] is True
- assert es['in_ia'] is False
- assert es['in_ia_sim'] is False
- assert es['in_kbart'] is False
- assert es['in_jstor'] is False
+ assert es["title"] == "Jakobshavn Glacier Bed Elevation"
+ assert es["ident"] == "3mssw2qnlnblbk7oqyv2dafgey"
+ assert es["file_count"] == 0
+ assert es["fileset_count"] == 1
+ assert es["webcapture_count"] == 0
+
+ assert es["preservation"] == "dark"
+ assert es["is_oa"] is True
+ assert es["is_longtail_oa"] is False
+ assert es["is_preserved"] is True
+ assert es["in_web"] is True
+ assert es["in_dweb"] is True
+ assert es["in_ia"] is False
+ assert es["in_ia_sim"] is False
+ assert es["in_kbart"] is False
+ assert es["in_jstor"] is False
# this release has a web capture, and no file (edited the JSON to remove file)
- r = entity_from_json(open('./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json', 'r').read(), ReleaseEntity)
+ r = entity_from_json(
+ open("./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json", "r").read(), ReleaseEntity
+ )
es = release_to_elasticsearch(r)
- assert es['title'] == "Rethinking Personal Digital Archiving, Part 1"
- assert es['ident'] == "mjtqtuyhwfdr7j2c3l36uor7uy"
- assert es['file_count'] == 0
- assert es['fileset_count'] == 0
- assert es['webcapture_count'] == 1
-
- assert es['preservation'] == "bright"
- assert es['is_oa'] is True
- assert es['is_longtail_oa'] is False
- assert es['is_preserved'] is True
- assert es['in_web'] is True
- assert es['in_dweb'] is False
- assert es['in_ia'] is True
- assert es['in_ia_sim'] is False
- assert es['in_kbart'] is False
- assert es['in_jstor'] is False
+ assert es["title"] == "Rethinking Personal Digital Archiving, Part 1"
+ assert es["ident"] == "mjtqtuyhwfdr7j2c3l36uor7uy"
+ assert es["file_count"] == 0
+ assert es["fileset_count"] == 0
+ assert es["webcapture_count"] == 1
+
+ assert es["preservation"] == "bright"
+ assert es["is_oa"] is True
+ assert es["is_longtail_oa"] is False
+ assert es["is_preserved"] is True
+ assert es["in_web"] is True
+ assert es["in_dweb"] is False
+ assert es["in_ia"] is True
+ assert es["in_ia_sim"] is False
+ assert es["in_kbart"] is False
+ assert es["in_jstor"] is False
+
def test_elasticsearch_container_transform(journal_metadata_importer):
- with open('tests/files/journal_metadata.sample.json', 'r') as f:
+ with open("tests/files/journal_metadata.sample.json", "r") as f:
raw1 = json.loads(f.readline())
raw2 = json.loads(f.readline())
c1 = journal_metadata_importer.parse_record(raw1)
- c1.state = 'active'
+ c1.state = "active"
c2 = journal_metadata_importer.parse_record(raw2)
- c2.state = 'active'
+ c2.state = "active"
- c1.extra['publisher_type'] = "big5"
- c1.extra['discipline'] = "history"
+ c1.extra["publisher_type"] = "big5"
+ c1.extra["discipline"] = "history"
es = container_to_elasticsearch(c1)
- assert es['publisher'] == c1.publisher
- assert es['discipline'] == c1.extra['discipline']
- assert es['publisher_type'] == c1.extra['publisher_type']
- assert es['keepers'] == []
+ assert es["publisher"] == c1.publisher
+ assert es["discipline"] == c1.extra["discipline"]
+ assert es["publisher_type"] == c1.extra["publisher_type"]
+ assert es["keepers"] == []
stats = {
"ident": "en4qj5ijrbf5djxx7p5zzpjyoq",
@@ -186,71 +202,70 @@ def test_elasticsearch_container_transform(journal_metadata_importer):
"dark": 1635,
"none": 0,
"shadows_only": 0,
- "total": 11136
+ "total": 11136,
},
- "release_type": {
- "_unknown": 9,
- "article-journal": 11124,
- "editorial": 2,
- "letter": 1
- },
- "total": 11136
+ "release_type": {"_unknown": 9, "article-journal": 11124, "editorial": 2, "letter": 1},
+ "total": 11136,
}
es = container_to_elasticsearch(c2, stats=stats)
- assert es['name'] == c2.name
- assert es['publisher'] == c2.publisher
- assert es['keepers'] == list(c2.extra['kbart'].keys()) == ["portico"]
- assert es['any_kbart'] is True
+ assert es["name"] == c2.name
+ assert es["publisher"] == c2.publisher
+ assert es["keepers"] == list(c2.extra["kbart"].keys()) == ["portico"]
+ assert es["any_kbart"] is True
def test_elasticsearch_file_transform():
- with open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r') as f:
+ with open("./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json", "r") as f:
json_str = f.read()
fe = entity_from_json(json_str, FileEntity)
- fe.state = 'active'
+ fe.state = "active"
es = file_to_elasticsearch(fe)
# pylint infers type of 'fe' incorrectly for some reason (as str/bytes)
- assert es['sha1'] == fe.sha1 # pylint: disable=no-member
- assert es['sha256'] == fe.sha256 # pylint: disable=no-member
- assert es['md5'] == fe.md5 # pylint: disable=no-member
- assert es['size_bytes'] == fe.size # pylint: disable=no-member
- assert es['mimetype'] == fe.mimetype # pylint: disable=no-member
- assert es['in_ia'] is True
-
- assert 'web' in es['rels']
- assert 'www.zhros.ru' in es['hosts']
- assert 'zhros.ru' in es['domains']
- assert 'archive.org' in (es['hosts'] + es['domains'])
- assert 'web.archive.org' in (es['hosts'] + es['domains'])
+ assert es["sha1"] == fe.sha1 # pylint: disable=no-member
+ assert es["sha256"] == fe.sha256 # pylint: disable=no-member
+ assert es["md5"] == fe.md5 # pylint: disable=no-member
+ assert es["size_bytes"] == fe.size # pylint: disable=no-member
+ assert es["mimetype"] == fe.mimetype # pylint: disable=no-member
+ assert es["in_ia"] is True
+
+ assert "web" in es["rels"]
+ assert "www.zhros.ru" in es["hosts"]
+ assert "zhros.ru" in es["domains"]
+ assert "archive.org" in (es["hosts"] + es["domains"])
+ assert "web.archive.org" in (es["hosts"] + es["domains"])
# old regression
- assert '.archive.org' not in (es['hosts'] + es['domains'])
+ assert ".archive.org" not in (es["hosts"] + es["domains"])
+
def test_elasticsearch_changelog_transform():
- ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
+ ce = entity_from_json(
+ open("./tests/files/changelog_3469683.json", "r").read(), ChangelogEntry
+ )
es = changelog_to_elasticsearch(ce)
- assert es['index'] == 3469683
+ assert es["index"] == 3469683
# len("2020-01-30T05:04:39") => 19
- assert es['timestamp'][:19] == "2020-01-30T05:04:39.738601Z"[:19]
- assert es['editor_id'] == "scmbogxw25evtcesfcab5qaboa"
- assert es['username'] == "crawl-bot"
- assert es['is_bot'] is True
- assert es['is_admin'] is True
- assert es['agent'] == "fatcat_tools.IngestFileResultImporter"
-
- assert es['total'] == 50
- assert es['files'] == 50
- assert es['new_files'] == 50
- assert es['created'] == 50
-
- assert es['releases'] == 0
- assert es['new_releases'] == 0
- assert es['updated'] == 0
- assert es['deleted'] == 0
+ assert es["timestamp"][:19] == "2020-01-30T05:04:39.738601Z"[:19]
+ assert es["editor_id"] == "scmbogxw25evtcesfcab5qaboa"
+ assert es["username"] == "crawl-bot"
+ assert es["is_bot"] is True
+ assert es["is_admin"] is True
+ assert es["agent"] == "fatcat_tools.IngestFileResultImporter"
+
+ assert es["total"] == 50
+ assert es["files"] == 50
+ assert es["new_files"] == 50
+ assert es["created"] == 50
+
+ assert es["releases"] == 0
+ assert es["new_releases"] == 0
+ assert es["updated"] == 0
+ assert es["deleted"] == 0
+
def test_elasticsearch_release_kbart_year():
this_year = datetime.date.today().year
@@ -264,7 +279,7 @@ def test_elasticsearch_release_kbart_year():
ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
],
)
- r.state = 'active'
+ r.state = "active"
r.container = ContainerEntity(
name="dummy journal",
extra={
@@ -276,18 +291,18 @@ def test_elasticsearch_release_kbart_year():
},
)
es = release_to_elasticsearch(r)
- assert es['release_year'] == this_year
-
- assert es['preservation'] == "none"
- assert es['is_oa'] is True
- assert es['is_longtail_oa'] is False
- assert es['is_preserved'] is False
- assert es['in_web'] is False
- assert es['in_dweb'] is False
- assert es['in_ia'] is False
- assert es['in_ia_sim'] is False
- assert es['in_kbart'] is False
- assert es['in_jstor'] is False
+ assert es["release_year"] == this_year
+
+ assert es["preservation"] == "none"
+ assert es["is_oa"] is True
+ assert es["is_longtail_oa"] is False
+ assert es["is_preserved"] is False
+ assert es["in_web"] is False
+ assert es["in_dweb"] is False
+ assert es["in_ia"] is False
+ assert es["in_ia_sim"] is False
+ assert es["in_kbart"] is False
+ assert es["in_jstor"] is False
r.container = ContainerEntity(
name="dummy journal",
@@ -300,15 +315,15 @@ def test_elasticsearch_release_kbart_year():
},
)
es = release_to_elasticsearch(r)
- assert es['release_year'] == this_year
-
- assert es['preservation'] == "dark"
- assert es['is_oa'] is True
- assert es['is_longtail_oa'] is False
- assert es['is_preserved'] is True
- assert es['in_web'] is False
- assert es['in_dweb'] is False
- assert es['in_ia'] is False
- assert es['in_ia_sim'] is False
- assert es['in_kbart'] is True
- assert es['in_jstor'] is False
+ assert es["release_year"] == this_year
+
+ assert es["preservation"] == "dark"
+ assert es["is_oa"] is True
+ assert es["is_longtail_oa"] is False
+ assert es["is_preserved"] is True
+ assert es["in_web"] is False
+ assert es["in_dweb"] is False
+ assert es["in_ia"] is False
+ assert es["in_ia_sim"] is False
+ assert es["in_kbart"] is True
+ assert es["in_jstor"] is False