aboutsummaryrefslogtreecommitdiffstats
path: root/mapreduce
diff options
context:
space:
mode:
Diffstat (limited to 'mapreduce')
-rw-r--r--mapreduce/tests/test_extraction_cdx_grobid.py59
-rw-r--r--mapreduce/tests/test_grobid2json.py8
2 files changed, 51 insertions, 16 deletions
diff --git a/mapreduce/tests/test_extraction_cdx_grobid.py b/mapreduce/tests/test_extraction_cdx_grobid.py
index 02d2b41..713e501 100644
--- a/mapreduce/tests/test_extraction_cdx_grobid.py
+++ b/mapreduce/tests/test_extraction_cdx_grobid.py
@@ -35,7 +35,7 @@ def test_mapper_lines(mock_fetch, job):
with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'r') as f:
real_tei_xml = f.read()
responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
- body=real_tei_xml, content_type='application/json')
+ body=real_tei_xml, content_type='text/xml')
raw = io.BytesIO(b"""
com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
@@ -53,15 +53,15 @@ com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robo
#print(list(job.hb_table.scan()))
# wayback gets FETCH 1x times
- # TODO:
-
- # grobid gets POST 3x times
- # TODO:
-
- # hbase
- # TODO:
+ mock_fetch.assert_called_once_with(
+ "CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ 328850624,
+ 854156)
+ # grobid gets POST 1x times
+ assert len(responses.calls) == 1
+ # HBase
assert job.hb_table.row(b'1') == {}
# HTTP 301
assert job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
@@ -70,6 +70,7 @@ com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robo
# text/plain
assert job.hb_table.row(b'sha1:6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD') == {}
+ # Saved extraction info
row = job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J')
assert struct.unpack("!q", row[b'file:size'])[0] == len(FAKE_PDF_BYTES)
@@ -152,12 +153,38 @@ def test_parse_cdx_skip(job):
assert status['status'] == "skip"
assert 'mimetype' in status['reason']
-def test_tei_json_convert():
- # TODO: load xml test vector, run it through
- with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'r') as f:
- xml_content = f.read()
- pass
-def test_tei_json_convert_invalid():
- # TODO: pass in junk
- pass
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_grobid_503(mock_fetch, job):
+
+ status = b"{'status': 'done broke due to 503'}"
+ responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=503,
+ body=status)
+
+ raw = io.BytesIO(b"""com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz""")
+
+ output = io.BytesIO()
+ job.sandbox(stdin=raw, stdout=output)
+ job.run_mapper()
+ row = job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ')
+ status = json.loads(row[b'grobid0:status'].decode('utf-8'))
+ assert json.loads(row[b'grobid0:status'].decode('utf-8')) == status
+
+
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_grobid_not_xml(mock_fetch, job):
+
+ status = b"{'status': 'done broke'}"
+ responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
+ body=status)
+
+ raw = io.BytesIO(b"""com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz""")
+
+ output = io.BytesIO()
+ job.sandbox(stdin=raw, stdout=output)
+ job.run_mapper()
+ row = job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ')
+ assert json.loads(row[b'grobid0:status'].decode('utf-8')) == status
+
diff --git a/mapreduce/tests/test_grobid2json.py b/mapreduce/tests/test_grobid2json.py
index 40853dd..1562006 100644
--- a/mapreduce/tests/test_grobid2json.py
+++ b/mapreduce/tests/test_grobid2json.py
@@ -1,4 +1,5 @@
+import xml
import json
import pytest
from grobid2json import *
@@ -12,3 +13,10 @@ def test_small_xml():
json_form = json.loads(f.read())
assert teixml2json(tei_xml) == json_form
+
+def test_invalid_xml():
+
+ with pytest.raises(xml.etree.ElementTree.ParseError):
+ teixml2json("this is not XML")
+ with pytest.raises(xml.etree.ElementTree.ParseError):
+ teixml2json("<xml></xml>")