From cad812bf78f7363e698139cd7b95d7434f8ae4bb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 17:29:44 -0800 Subject: ingest: tests for basic XML ingest --- python/tests/files/example_ingest_xml.json | 1 + python/tests/import_ingest.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 python/tests/files/example_ingest_xml.json (limited to 'python/tests') diff --git a/python/tests/files/example_ingest_xml.json b/python/tests/files/example_ingest_xml.json new file mode 100644 index 00000000..ba61b183 --- /dev/null +++ b/python/tests/files/example_ingest_xml.json @@ -0,0 +1 @@ +{"cdx": {"datetime": "20200710091403", "mimetype": "text/xml", "sha1b32": "PWMQ2L4RHPJ3NVWC66GIJC36L5FXPOM6", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "status_code": 200, "surt": "py,una,iics,scielo)/scieloorg/php/articlexml.php?lang=en&pid=s1683-98032015000200002", "url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en", "warc_csize": 12108, "warc_offset": 94730348, "warc_path": "SCIELO-CRAWL-2020-07-20200710082036515-00773-00843-wbgrp-svc206/SCIELO-CRAWL-2020-07-20200710085423121-00779-13069~wbgrp-svc206.us.archive.org~8443.warc.gz"}, "file_meta": {"md5hex": "cda133a706ce02a07fae8bd8d2694a2a", "mimetype": "application/jats+xml", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "sha256hex": "be982ca211e4debb3f93f36d9f9dc1c80f99a8809eb4c41569b2b9503c27e751", "size_bytes": 49242}, "hit": true, "hops": ["http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"], "request": {"ingest_request_source": "fatcat-changelog","base_url": "http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "ext_ids": {"doi": "10.123/abc"}, "fatcat": {"release_ident": null}, "ingest_type": "xml"}, "status": "success", "terminal": {"terminal_dt": "20200710091403", "terminal_sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "terminal_status_code": 200, "terminal_url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"}, "xml_meta": {"status": "success"}} diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 05287af4..21552fb9 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -41,6 +41,23 @@ def test_ingest_importer(ingest_importer): assert counts['exists'] == 1 assert counts['skip'] == 1 +def test_ingest_importer_xml(ingest_importer): + last_index = ingest_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_ingest_xml.json', 'r') as f: + ingest_importer.bezerk_mode = True + counts = JsonLinePusher(ingest_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = ingest_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "crawled from web" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent'] + def test_ingest_importer_stage(ingest_importer, api): """ Tests that ingest importer correctly handles release stage matching -- cgit v1.2.3