From e16672c4c21e17c2d2c653e7d480f4ba671771fb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 23:04:24 -0800 Subject: ingest: progress on HTML ingest --- python/tests/import_ingest.py | 45 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'python/tests/import_ingest.py') diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 21552fb9..92539f1a 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -2,7 +2,7 @@ import json import pytest -from fatcat_tools.importers import IngestFileResultImporter, JsonLinePusher +from fatcat_tools.importers import IngestFileResultImporter, IngestWebResultImporter, JsonLinePusher from fixtures import * @@ -10,6 +10,10 @@ from fixtures import * def ingest_importer(api): yield IngestFileResultImporter(api) +@pytest.fixture(scope="function") +def ingest_web_importer(api): + yield IngestWebResultImporter(api) + # TODO: use API to check that entities actually created... def test_ingest_importer_basic(ingest_importer): with open('tests/files/example_ingest.json', 'r') as f: @@ -46,6 +50,7 @@ def test_ingest_importer_xml(ingest_importer): with open('tests/files/example_ingest_xml.json', 'r') as f: ingest_importer.bezerk_mode = True counts = JsonLinePusher(ingest_importer, f).run() + print(counts) assert counts['insert'] == 1 assert counts['exists'] == 0 assert counts['skip'] == 0 @@ -58,6 +63,42 @@ def test_ingest_importer_xml(ingest_importer): assert eg.extra['git_rev'] assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent'] + # re-import should skip + with open('tests/files/example_ingest_xml.json', 'r') as f: + ingest_importer.reset() + ingest_importer.bezerk_mode = False + counts = JsonLinePusher(ingest_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + +def test_ingest_importer_web(ingest_web_importer): + last_index = ingest_web_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_ingest_html.json', 'r') as f: + ingest_web_importer.bezerk_mode = True + counts = JsonLinePusher(ingest_web_importer, f).run() + print(counts) + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = ingest_web_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "crawled from web" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.IngestWebResultImporter" in eg.extra['agent'] + + # re-import should skip + with open('tests/files/example_ingest_html.json', 'r') as f: + ingest_web_importer.reset() + ingest_web_importer.bezerk_mode = False + counts = JsonLinePusher(ingest_web_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + def test_ingest_importer_stage(ingest_importer, api): """ Tests that ingest importer correctly handles release stage matching @@ -74,7 +115,7 @@ def test_ingest_importer_stage(ingest_importer, api): with open('tests/files/example_ingest.json', 'r') as f: raw = json.loads(f.readline()) for row in test_table: - print(row) + #print(row) # set dummy record stage eg = quick_eg(api) -- cgit v1.2.3