diff options
author | Martin Czygan <martin@archive.org> | 2020-11-19 22:36:55 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2020-11-19 22:36:55 +0000 |
commit | 03eadfc7e2bee4213345f6464378e87b8f741d20 (patch) | |
tree | 3e5b13af8ba46b240f9ae53d5f522fb7ee02c219 /python/tests/import_ingest.py | |
parent | 5afde4690a4653db53fe4962af5da3eb9188d9a2 (diff) | |
parent | a73b73c2944b3df2a62886c4e6b69c93f5e74222 (diff) | |
download | fatcat-03eadfc7e2bee4213345f6464378e87b8f741d20.tar.gz fatcat-03eadfc7e2bee4213345f6464378e87b8f741d20.zip |
Merge branch 'bnewbold-xml-html-ingest' into 'master'
HTML webcapture ingest (and XML file ingest)
See merge request webgroup/fatcat!88
Diffstat (limited to 'python/tests/import_ingest.py')
-rw-r--r-- | python/tests/import_ingest.py | 68 |
1 files changed, 66 insertions, 2 deletions
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 4a46232a..92539f1a 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -2,7 +2,7 @@ import json import pytest -from fatcat_tools.importers import IngestFileResultImporter, JsonLinePusher +from fatcat_tools.importers import IngestFileResultImporter, IngestWebResultImporter, JsonLinePusher from fixtures import * @@ -10,6 +10,10 @@ from fixtures import * def ingest_importer(api): yield IngestFileResultImporter(api) +@pytest.fixture(scope="function") +def ingest_web_importer(api): + yield IngestWebResultImporter(api) + # TODO: use API to check that entities actually created... def test_ingest_importer_basic(ingest_importer): with open('tests/files/example_ingest.json', 'r') as f: @@ -41,6 +45,60 @@ def test_ingest_importer(ingest_importer): assert counts['exists'] == 1 assert counts['skip'] == 1 +def test_ingest_importer_xml(ingest_importer): + last_index = ingest_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_ingest_xml.json', 'r') as f: + ingest_importer.bezerk_mode = True + counts = JsonLinePusher(ingest_importer, f).run() + print(counts) + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = ingest_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "crawled from web" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent'] + + # re-import should skip + with open('tests/files/example_ingest_xml.json', 'r') as f: + ingest_importer.reset() + ingest_importer.bezerk_mode = False + counts = JsonLinePusher(ingest_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + +def test_ingest_importer_web(ingest_web_importer): + last_index = ingest_web_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_ingest_html.json', 'r') as f: + ingest_web_importer.bezerk_mode = True + counts = JsonLinePusher(ingest_web_importer, f).run() + print(counts) + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = ingest_web_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "crawled from web" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.IngestWebResultImporter" in eg.extra['agent'] + + # re-import should skip + with open('tests/files/example_ingest_html.json', 'r') as f: + ingest_web_importer.reset() + ingest_web_importer.bezerk_mode = False + counts = JsonLinePusher(ingest_web_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + def test_ingest_importer_stage(ingest_importer, api): """ Tests that ingest importer correctly handles release stage matching @@ -57,7 +115,7 @@ def test_ingest_importer_stage(ingest_importer, api): with open('tests/files/example_ingest.json', 'r') as f: raw = json.loads(f.readline()) for row in test_table: - print(row) + #print(row) # set dummy record stage eg = quick_eg(api) @@ -94,6 +152,12 @@ def test_ingest_dict_parse(ingest_importer): def test_ingest_dict_parse_old(ingest_importer): with open('tests/files/example_ingest.old.json', 'r') as f: raw = json.loads(f.readline()) + + # ancient ingest requests had no type; skip them + f = ingest_importer.parse_record(raw) + assert f == None + raw['request']['ingest_type'] = 'pdf' + f = ingest_importer.parse_record(raw) assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313" assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff" |