diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:13:14 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:13:14 -0700 | 
| commit | cdfd6b85b386b7bbf9d5a5179ef26970b6e5a4e7 (patch) | |
| tree | 5e4034027b51f3ee4d2a488bb2cbb7a75c3bd0d8 /python/tests/import_ingest.py | |
| parent | 78f08280edea4ff65ca613ad30005c45cc48dea6 (diff) | |
| download | fatcat-cdfd6b85b386b7bbf9d5a5179ef26970b6e5a4e7.tar.gz fatcat-cdfd6b85b386b7bbf9d5a5179ef26970b6e5a4e7.zip  | |
fmt (black): tests/
Diffstat (limited to 'python/tests/import_ingest.py')
| -rw-r--r-- | python/tests/import_ingest.py | 93 | 
1 file changed, 50 insertions, 43 deletions
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 04a8e7f6..d9e7d294 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -1,4 +1,3 @@ -  import json  import pytest @@ -15,94 +14,100 @@ from fatcat_tools.importers import (  def ingest_importer(api):      yield IngestFileResultImporter(api) +  @pytest.fixture(scope="function")  def ingest_web_importer(api):      yield IngestWebResultImporter(api) +  # TODO: use API to check that entities actually created...  def test_ingest_importer_basic(ingest_importer): -    with open('tests/files/example_ingest.json', 'r') as f: +    with open("tests/files/example_ingest.json", "r") as f:          JsonLinePusher(ingest_importer, f).run() +  def test_ingest_importer(ingest_importer):      last_index = ingest_importer.api.get_changelog(limit=1)[0].index -    with open('tests/files/example_ingest.json', 'r') as f: +    with open("tests/files/example_ingest.json", "r") as f:          ingest_importer.bezerk_mode = True          counts = JsonLinePusher(ingest_importer, f).run() -    assert counts['insert'] == 1 -    assert counts['exists'] == 0 -    assert counts['skip'] == 1 +    assert counts["insert"] == 1 +    assert counts["exists"] == 0 +    assert counts["skip"] == 1      # fetch most recent editgroup -    change = ingest_importer.api.get_changelog_entry(index=last_index+1) +    change = ingest_importer.api.get_changelog_entry(index=last_index + 1)      eg = change.editgroup      assert eg.description      assert "crawled from web" in eg.description.lower() -    assert eg.extra['git_rev'] -    assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent'] +    assert eg.extra["git_rev"] +    assert "fatcat_tools.IngestFileResultImporter" in eg.extra["agent"]      # re-insert; should skip -    with open('tests/files/example_ingest.json', 'r') as f: +    with open("tests/files/example_ingest.json", "r") as f:          ingest_importer.reset()          
ingest_importer.bezerk_mode = False          counts = JsonLinePusher(ingest_importer, f).run() -    assert counts['insert'] == 0 -    assert counts['exists'] == 1 -    assert counts['skip'] == 1 +    assert counts["insert"] == 0 +    assert counts["exists"] == 1 +    assert counts["skip"] == 1 +  def test_ingest_importer_xml(ingest_importer):      last_index = ingest_importer.api.get_changelog(limit=1)[0].index -    with open('tests/files/example_ingest_xml.json', 'r') as f: +    with open("tests/files/example_ingest_xml.json", "r") as f:          ingest_importer.bezerk_mode = True          counts = JsonLinePusher(ingest_importer, f).run()      print(counts) -    assert counts['insert'] == 1 -    assert counts['exists'] == 0 -    assert counts['skip'] == 0 +    assert counts["insert"] == 1 +    assert counts["exists"] == 0 +    assert counts["skip"] == 0      # fetch most recent editgroup -    change = ingest_importer.api.get_changelog_entry(index=last_index+1) +    change = ingest_importer.api.get_changelog_entry(index=last_index + 1)      eg = change.editgroup      assert eg.description      assert "crawled from web" in eg.description.lower() -    assert eg.extra['git_rev'] -    assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent'] +    assert eg.extra["git_rev"] +    assert "fatcat_tools.IngestFileResultImporter" in eg.extra["agent"]      # re-import should skip -    with open('tests/files/example_ingest_xml.json', 'r') as f: +    with open("tests/files/example_ingest_xml.json", "r") as f:          ingest_importer.reset()          ingest_importer.bezerk_mode = False          counts = JsonLinePusher(ingest_importer, f).run() -    assert counts['insert'] == 0 -    assert counts['exists'] == 1 -    assert counts['skip'] == 0 +    assert counts["insert"] == 0 +    assert counts["exists"] == 1 +    assert counts["skip"] == 0 +  def test_ingest_importer_web(ingest_web_importer):      last_index = ingest_web_importer.api.get_changelog(limit=1)[0].index -  
  with open('tests/files/example_ingest_html.json', 'r') as f: +    with open("tests/files/example_ingest_html.json", "r") as f:          ingest_web_importer.bezerk_mode = True          counts = JsonLinePusher(ingest_web_importer, f).run()      print(counts) -    assert counts['insert'] == 1 -    assert counts['exists'] == 0 -    assert counts['skip'] == 0 +    assert counts["insert"] == 1 +    assert counts["exists"] == 0 +    assert counts["skip"] == 0      # fetch most recent editgroup -    change = ingest_web_importer.api.get_changelog_entry(index=last_index+1) +    change = ingest_web_importer.api.get_changelog_entry(index=last_index + 1)      eg = change.editgroup      assert eg.description      assert "crawled from web" in eg.description.lower() -    assert eg.extra['git_rev'] -    assert "fatcat_tools.IngestWebResultImporter" in eg.extra['agent'] +    assert eg.extra["git_rev"] +    assert "fatcat_tools.IngestWebResultImporter" in eg.extra["agent"]      # re-import should skip -    with open('tests/files/example_ingest_html.json', 'r') as f: +    with open("tests/files/example_ingest_html.json", "r") as f:          ingest_web_importer.reset()          ingest_web_importer.bezerk_mode = False          counts = JsonLinePusher(ingest_web_importer, f).run() -    assert counts['insert'] == 0 -    assert counts['exists'] == 1 -    assert counts['skip'] == 0 +    assert counts["insert"] == 0 +    assert counts["exists"] == 1 +    assert counts["skip"] == 0 +  def test_ingest_importer_stage(ingest_importer, api):      """ @@ -117,29 +122,30 @@ def test_ingest_importer_stage(ingest_importer, api):          dict(request_stage="published", release_stage="draft", status="skip-release-stage"),      ]      ingest_importer.bezerk_mode = True -    with open('tests/files/example_ingest.json', 'r') as f: +    with open("tests/files/example_ingest.json", "r") as f:          raw = json.loads(f.readline())      for row in test_table: -        #print(row) +        # print(row)    
      # set dummy record stage          eg = quick_eg(api)          r1 = api.lookup_release(doi="10.123/abc") -        r1.release_stage = row['release_stage'] +        r1.release_stage = row["release_stage"]          api.update_release(eg.editgroup_id, r1.ident, r1)          api.accept_editgroup(eg.editgroup_id)          # set ingest request stage -        raw['request']['release_stage'] = row['request_stage'] +        raw["request"]["release_stage"] = row["request_stage"]          ingest_importer.reset()          ingest_importer.push_record(raw)          counts = ingest_importer.finish()          print(counts)          assert counts["total"] == 1 -        assert counts[row['status']] == 1 +        assert counts[row["status"]] == 1 +  def test_ingest_dict_parse(ingest_importer): -    with open('tests/files/example_ingest.json', 'r') as f: +    with open("tests/files/example_ingest.json", "r") as f:          raw = json.loads(f.readline())          f = ingest_importer.parse_record(raw)          assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313" @@ -154,14 +160,15 @@ def test_ingest_dict_parse(ingest_importer):                  assert u.url.startswith("https://web.archive.org/")          assert len(f.release_ids) == 1 +  def test_ingest_dict_parse_old(ingest_importer): -    with open('tests/files/example_ingest.old.json', 'r') as f: +    with open("tests/files/example_ingest.old.json", "r") as f:          raw = json.loads(f.readline())          # ancient ingest requests had no type; skip them          f = ingest_importer.parse_record(raw)          assert f is None -        raw['request']['ingest_type'] = 'pdf' +        raw["request"]["ingest_type"] = "pdf"          f = ingest_importer.parse_record(raw)          assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313"  | 
