about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-11-05 17:29:44 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-11-05 17:29:44 -0800
commit: cad812bf78f7363e698139cd7b95d7434f8ae4bb (patch)
tree: 3896497c20ff0b257fe425cf56b33a3e79814aee
parent: 8b6b1447cc37fb76865fd80377c55463e59db3b9 (diff)
download: fatcat-cad812bf78f7363e698139cd7b95d7434f8ae4bb.tar.gz
fatcat-cad812bf78f7363e698139cd7b95d7434f8ae4bb.zip
ingest: tests for basic XML ingest
-rw-r--r-- python/tests/files/example_ingest_xml.json 1
-rw-r--r-- python/tests/import_ingest.py 17
2 files changed, 18 insertions, 0 deletions
diff --git a/python/tests/files/example_ingest_xml.json b/python/tests/files/example_ingest_xml.json
new file mode 100644
index 00000000..ba61b183
--- /dev/null
+++ b/python/tests/files/example_ingest_xml.json
@@ -0,0 +1 @@
+{"cdx": {"datetime": "20200710091403", "mimetype": "text/xml", "sha1b32": "PWMQ2L4RHPJ3NVWC66GIJC36L5FXPOM6", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "status_code": 200, "surt": "py,una,iics,scielo)/scieloorg/php/articlexml.php?lang=en&pid=s1683-98032015000200002", "url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en", "warc_csize": 12108, "warc_offset": 94730348, "warc_path": "SCIELO-CRAWL-2020-07-20200710082036515-00773-00843-wbgrp-svc206/SCIELO-CRAWL-2020-07-20200710085423121-00779-13069~wbgrp-svc206.us.archive.org~8443.warc.gz"}, "file_meta": {"md5hex": "cda133a706ce02a07fae8bd8d2694a2a", "mimetype": "application/jats+xml", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "sha256hex": "be982ca211e4debb3f93f36d9f9dc1c80f99a8809eb4c41569b2b9503c27e751", "size_bytes": 49242}, "hit": true, "hops": ["http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"], "request": {"ingest_request_source": "fatcat-changelog","base_url": "http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "ext_ids": {"doi": "10.123/abc"}, "fatcat": {"release_ident": null}, "ingest_type": "xml"}, "status": "success", "terminal": {"terminal_dt": "20200710091403", "terminal_sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "terminal_status_code": 200, "terminal_url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"}, "xml_meta": {"status": "success"}}
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py
index 05287af4..21552fb9 100644
--- a/python/tests/import_ingest.py
+++ b/python/tests/import_ingest.py
@@ -41,6 +41,23 @@ def test_ingest_importer(ingest_importer):
assert counts['exists'] == 1
assert counts['skip'] == 1
+def test_ingest_importer_xml(ingest_importer):  # pushes the XML ingest fixture through the importer, then checks the counts and the resulting changelog entry
+ last_index = ingest_importer.api.get_changelog(limit=1)[0].index  # changelog index recorded before the import; presumably the newest entry -- confirm get_changelog ordering
+ with open('tests/files/example_ingest_xml.json', 'r') as f:  # fixture added in this same commit: a single JSON line
+ ingest_importer.bezerk_mode = True  # NOTE(review): importer flag whose semantics are defined outside this diff -- confirm its effect
+ counts = JsonLinePusher(ingest_importer, f).run()  # run() returns the per-outcome counts checked below
+ assert counts['insert'] == 1  # the one fixture record must be inserted...
+ assert counts['exists'] == 0  # ...not reported as already existing...
+ assert counts['skip'] == 0  # ...and not skipped
+
+ # fetch the changelog entry right after the pre-import index (assumed to be the editgroup created by this import)
+ change = ingest_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description  # description must be non-empty
+ assert "crawled from web" in eg.description.lower()  # case-insensitive substring check
+ assert eg.extra['git_rev']  # extra metadata must carry a non-empty git_rev
+ assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent']  # agent string must name this importer
+
def test_ingest_importer_stage(ingest_importer, api):
"""
Tests that ingest importer correctly handles release stage matching