aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/sandcrawler/grobid.py6
-rw-r--r--python/tests/test_grobid.py24
-rw-r--r--python/tests/test_wayback.py2
3 files changed, 17 insertions, 15 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 9fd5ad4..31af974 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -85,8 +85,8 @@ class GrobidWorker(SandcrawlerWorker):
if not self.wayback_client:
raise Exception("wayback client not configured for this GrobidWorker")
try:
- blob = self.wayback_client.fetch_warc_content(record['warc_path'],
- record['warc_offset'], record['warc_csize'])
+ blob = self.wayback_client.fetch_petabox_body(record['warc_csize'],
+ record['warc_offset'], record['warc_path'])
except WaybackError as we:
return dict(status="error-wayback", error_msg=str(we), source=record)
elif record.get('url') and record.get('datetime'):
@@ -94,7 +94,7 @@ class GrobidWorker(SandcrawlerWorker):
if not self.wayback_client:
raise Exception("wayback client not configured for this GrobidWorker")
try:
- blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime'])
+ blob = self.wayback_client.fetch_warc_by_url_dt(record['url'], record['datetime'])
except WaybackError as we:
return dict(status="error-wayback", error_msg=str(we), source=record)
elif record.get('item') and record.get('path'):
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 10560cd..330c384 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -4,6 +4,7 @@ import struct
import responses
from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from test_wayback import *
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
@@ -11,17 +12,22 @@ FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
REAL_TEI_XML = f.read()
-@responses.activate
-def test_grobid_503():
+@pytest.fixture
+def grobid_client():
+ client = GrobidClient(
+ host_url="http://localhost:8070",
+ )
+ return client
- client = GrobidClient(host_url="http://localhost:8070")
+@responses.activate
+def test_grobid_503(grobid_client):
status = b'{"status": "done broke due to 503"}'
responses.add(responses.POST,
'http://localhost:8070/api/processFulltextDocument', status=503,
body=status)
- resp = client.process_fulltext(FAKE_PDF_BYTES)
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets POST 1x times
assert len(responses.calls) == 1
@@ -31,15 +37,13 @@ def test_grobid_503():
@responses.activate
@pytest.mark.skip(reason="XXX: need to fix unicode/bytes something something")
-def test_grobid_success():
-
- client = GrobidClient(host_url="http://localhost:8070")
+def test_grobid_success(grobid_client):
responses.add(responses.POST,
'http://localhost:8070/api/processFulltextDocument', status=200,
body=REAL_TEI_XML, content_type='text/xml')
- resp = client.process_fulltext(FAKE_PDF_BYTES)
+ resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets POST 1x times
assert len(responses.calls) == 1
@@ -52,11 +56,9 @@ def test_grobid_success():
#assert resp['tei_xml'].split('\n')[:3] == REAL_TEI_XML.split('\n')[:3]
@responses.activate
-def test_grobid_worker_cdx():
+def test_grobid_worker_cdx(grobid_client, wayback_client):
sink = BlackholeSink()
- grobid_client = GrobidClient(host_url="http://localhost:8070")
- wayback_client = WaybackClient()
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
responses.add(responses.POST,
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index eeb4b37..8d15d70 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -96,7 +96,7 @@ def test_cdx_lookup_best(cdx_client):
assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
WARC_TARGET = "http://fatcat.wiki/"
-WARC_BODY = "<html>some stuff</html>"
+WARC_BODY = b"<html>some stuff</html>"
@pytest.fixture
def wayback_client(cdx_client, mocker):