# Provenance: commit bd508b4ffbab3ea56134f753a9746ff9d582fde3
# ("test of GROBID client", Bryan Newbold, 2019-09-25),
# which created python/tests/test_grobid.py.
"""Tests for the sandcrawler GrobidClient, run against mocked HTTP responses."""

import struct

import pytest
import responses

from sandcrawler import GrobidClient


# Fake PDF payload: a "%PDF" prefix plus an arbitrary 8-byte big-endian
# integer, so the body is binary rather than plain ASCII text.
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)

# TEI-XML fixture returned by the mocked GROBID endpoint in the success test.
# NOTE(review): the path is relative to the current working directory;
# presumably the test suite is run from the `python/` directory -- confirm.
with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
    REAL_TEI_XML = f.read()


@responses.activate
def test_grobid_503():
    """A 503 from GROBID is reported as an error result after a single POST."""
    client = GrobidClient(host_url="http://localhost:8070")

    status = b'{"status": "done broke due to 503"}'
    responses.add(
        responses.POST,
        'http://localhost:8070/api/processFulltextDocument',
        status=503,
        body=status,
    )

    resp = client.process_fulltext(FAKE_PDF_BYTES)

    # The mocked endpoint must have been hit exactly once.
    assert len(responses.calls) == 1

    assert resp['status_code'] == 503
    assert resp['status'] == "error"


@responses.activate
def test_grobid_success():
    """A 200 from GROBID yields a success result carrying the decoded TEI-XML."""
    client = GrobidClient(host_url="http://localhost:8070")

    responses.add(
        responses.POST,
        'http://localhost:8070/api/processFulltextDocument',
        status=200,
        body=REAL_TEI_XML,
        content_type='text/xml',
    )

    resp = client.process_fulltext(FAKE_PDF_BYTES)

    # The mocked endpoint must have been hit exactly once.
    assert len(responses.calls) == 1

    assert resp['status_code'] == 200
    assert resp['status'] == "success"
    # The fixture is read as bytes; the assertion compares against its
    # UTF-8 decoded text.
    assert resp['tei_xml'] == REAL_TEI_XML.decode('utf-8')