diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-09-25 18:02:43 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-25 18:02:43 -0700 |
commit | bd508b4ffbab3ea56134f753a9746ff9d582fde3 (patch) | |
tree | b7b0163b73069fd4c05265f64139bff3c03cf348 | |
parent | 716483103dd7fdfe7aab2982c51abae6d3f4271b (diff) | |
download | sandcrawler-bd508b4ffbab3ea56134f753a9746ff9d582fde3.tar.gz sandcrawler-bd508b4ffbab3ea56134f753a9746ff9d582fde3.zip |
test of GROBID client
-rw-r--r-- | python/tests/test_grobid.py | 53 |
1 files changed, 53 insertions, 0 deletions
# python/tests/test_grobid.py -- new file in this commit (mode 100644, index 0000000..fca234a)
"""Tests for ``GrobidClient.process_fulltext`` against a mocked GROBID HTTP API.

Every HTTP call is intercepted by the ``responses`` library, so no GROBID
server needs to be running.
"""

import pytest
import struct
import responses

from sandcrawler import GrobidClient


# Placeholder upload body: a "%PDF" prefix followed by one packed 8-byte
# big-endian integer. The mocked endpoint never inspects it.
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)

# Fixture TEI-XML document, loaded once at import time as raw bytes.
# The path is relative to the current working directory.
with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
    REAL_TEI_XML = f.read()


@responses.activate
def test_grobid_503():
    """A 503 from GROBID is reported as an error result with the status code."""

    client = GrobidClient(host_url="http://localhost:8070")

    status = b'{"status": "done broke due to 503"}'
    responses.add(responses.POST,
        'http://localhost:8070/api/processFulltextDocument', status=503,
        body=status)

    resp = client.process_fulltext(FAKE_PDF_BYTES)

    # grobid gets POST 1x times
    assert len(responses.calls) == 1

    assert resp['status_code'] == 503
    assert resp['status'] == "error"


@responses.activate
def test_grobid_success():
    """A 200 from GROBID yields a success result carrying the TEI-XML body.

    This test previously shared the name ``test_grobid_503`` with the test
    above; the later definition replaced the earlier one, so only one of the
    two was ever collected.
    """

    client = GrobidClient(host_url="http://localhost:8070")

    responses.add(responses.POST,
        'http://localhost:8070/api/processFulltextDocument', status=200,
        body=REAL_TEI_XML, content_type='text/xml')

    resp = client.process_fulltext(FAKE_PDF_BYTES)

    # grobid gets POST 1x times
    assert len(responses.calls) == 1

    assert resp['status_code'] == 200
    assert resp['status'] == "success"
    # NOTE(review): this comparison assumes the client hands back the body
    # decoded as UTF-8 -- confirm against GrobidClient, since the mocked
    # response declares 'text/xml' without a charset.
    assert resp['tei_xml'] == REAL_TEI_XML.decode('utf-8')