diff options
-rw-r--r-- | python/sandcrawler/grobid.py | 6 | ||||
-rw-r--r-- | python/tests/test_grobid.py | 24 | ||||
-rw-r--r-- | python/tests/test_wayback.py | 2 |
3 files changed, 17 insertions, 15 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 9fd5ad4..31af974 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -85,8 +85,8 @@ class GrobidWorker(SandcrawlerWorker): if not self.wayback_client: raise Exception("wayback client not configured for this GrobidWorker") try: - blob = self.wayback_client.fetch_warc_content(record['warc_path'], - record['warc_offset'], record['warc_csize']) + blob = self.wayback_client.fetch_petabox_body(record['warc_csize'], + record['warc_offset'], record['warc_path']) except WaybackError as we: return dict(status="error-wayback", error_msg=str(we), source=record) elif record.get('url') and record.get('datetime'): @@ -94,7 +94,7 @@ class GrobidWorker(SandcrawlerWorker): if not self.wayback_client: raise Exception("wayback client not configured for this GrobidWorker") try: - blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime']) + blob = self.wayback_client.fetch_warc_by_url_dt(record['url'], record['datetime']) except WaybackError as we: return dict(status="error-wayback", error_msg=str(we), source=record) elif record.get('item') and record.get('path'): diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 10560cd..330c384 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -4,6 +4,7 @@ import struct import responses from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient +from test_wayback import * FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) @@ -11,17 +12,22 @@ FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f: REAL_TEI_XML = f.read() -@responses.activate -def test_grobid_503(): +@pytest.fixture +def grobid_client(): + client = GrobidClient( + host_url="http://localhost:8070", + ) + return client - client = GrobidClient(host_url="http://localhost:8070") +@responses.activate +def test_grobid_503(grobid_client): status = b'{"status": "done broke due to 503"}' responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=503, body=status) - resp = client.process_fulltext(FAKE_PDF_BYTES) + resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 @@ -31,15 +37,13 @@ def test_grobid_503(): @responses.activate @pytest.mark.skip(reason="XXX: need to fix unicode/bytes something something") -def test_grobid_success(): - - client = GrobidClient(host_url="http://localhost:8070") +def test_grobid_success(grobid_client): responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200, body=REAL_TEI_XML, content_type='text/xml') - resp = client.process_fulltext(FAKE_PDF_BYTES) + resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 @@ -52,11 +56,9 @@ def test_grobid_success(): #assert resp['tei_xml'].split('\n')[:3] == REAL_TEI_XML.split('\n')[:3] @responses.activate -def test_grobid_worker_cdx(): +def test_grobid_worker_cdx(grobid_client, wayback_client): sink = BlackholeSink() - grobid_client = GrobidClient(host_url="http://localhost:8070") - wayback_client = WaybackClient() worker = GrobidWorker(grobid_client, wayback_client, sink=sink) responses.add(responses.POST, diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index eeb4b37..8d15d70 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -96,7 +96,7 @@ def test_cdx_lookup_best(cdx_client): assert resp.warc_path == CDX_SINGLE_HIT[1][-1] WARC_TARGET = "http://fatcat.wiki/" -WARC_BODY = "<html>some stuff</html>" +WARC_BODY = b"<html>some stuff</html>" @pytest.fixture def wayback_client(cdx_client, mocker): |