1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
import json
import pytest
import responses
from sandcrawler import *
from test_wayback import *
from test_savepagenow import *
from test_grobid import REAL_TEI_XML
@pytest.fixture
def ingest_worker(wayback_client, spn_client):
    """An IngestFileWorker wired to the shared wayback/SPN test fixtures.

    GROBID is pointed at localhost:8070; individual tests mock that
    endpoint with `responses` as needed.
    """
    return IngestFileWorker(
        wayback_client=wayback_client,
        spn_client=spn_client,
        grobid_client=GrobidClient(host_url="http://localhost:8070"),
    )
@pytest.fixture
def ingest_worker_pdf(wayback_client_pdf, spn_client):
    """Like `ingest_worker`, but backed by the PDF-returning wayback fixture."""
    return IngestFileWorker(
        wayback_client=wayback_client_pdf,
        spn_client=spn_client,
        grobid_client=GrobidClient(host_url="http://localhost:8070"),
    )
@responses.activate
def test_ingest_success(ingest_worker_pdf):
    """Happy-path ingest: SPNv2 capture, CDX lookup, wayback fetch of a real
    PDF, then GROBID extraction, ending in a 'success' hit.

    All HTTP endpoints are mocked with `responses`; registrations for the
    same URL are consumed in order, so the SPN status poll returns "pending"
    first and "success" second.
    """

    with open('tests/files/dummy.pdf', 'rb') as f:
        pdf_bytes = f.read()

    request = {
        'ingest_type': 'pdf',
        'base_url': "http://dummy-host/",
    }

    # SPNv2 capture request
    responses.add(responses.POST,
        'http://dummy-spnv2/save',
        status=200,
        body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
    # first status poll: still pending
    responses.add(responses.GET,
        'http://dummy-spnv2/save/status/' + JOB_ID,
        status=200,
        body=json.dumps(PENDING_BODY))
    # second status poll: capture complete
    responses.add(responses.GET,
        'http://dummy-spnv2/save/status/' + JOB_ID,
        status=200,
        body=json.dumps(SUCCESS_BODY))
    # CDX lookup of the freshly-captured resource
    responses.add(responses.GET,
        'http://dummy-cdx/cdx',
        status=200,
        body=json.dumps(CDX_SPN_HIT))
    # wayback replay serving the actual PDF bytes
    responses.add(responses.GET,
        'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
        status=200,
        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
        body=pdf_bytes)
    # GROBID fulltext extraction of the PDF
    responses.add(responses.POST,
        'http://localhost:8070/api/processFulltextDocument', status=200,
        body=REAL_TEI_XML, content_type='text/xml')

    resp = ingest_worker_pdf.process(request)

    print(resp)
    assert resp['hit'] is True
    assert resp['status'] == "success"
    assert resp['request'] == request
    assert resp['file_meta']['size_bytes']
    assert resp['grobid']
    # the bulky TEI-XML body should not be embedded in the result
    assert 'tei_xml' not in resp['grobid']
    assert resp['terminal']
@responses.activate
def test_ingest_landing(ingest_worker):
    """Ingest of an HTML landing page with no extractable PDF link: the
    worker should report a miss with status 'no-pdf-link' and skip GROBID.

    Same mocked SPNv2/CDX/wayback flow as the success test, but the wayback
    replay returns HTML instead of a PDF.
    """

    request = {
        'ingest_type': 'pdf',
        'base_url': "http://dummy-host/",
    }

    # SPNv2 capture request
    responses.add(responses.POST,
        'http://dummy-spnv2/save',
        status=200,
        body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
    # first status poll: still pending
    responses.add(responses.GET,
        'http://dummy-spnv2/save/status/' + JOB_ID,
        status=200,
        body=json.dumps(PENDING_BODY))
    # second status poll: capture complete
    responses.add(responses.GET,
        'http://dummy-spnv2/save/status/' + JOB_ID,
        status=200,
        body=json.dumps(SUCCESS_BODY))
    # CDX lookup of the captured resource
    responses.add(responses.GET,
        'http://dummy-cdx/cdx',
        status=200,
        body=json.dumps(CDX_SPN_HIT))
    # wayback replay: landing-page HTML rather than a PDF
    responses.add(responses.GET,
        'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
        status=200,
        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
        body=WARC_BODY)
    # this is for second time around; don't want to fetch same landing page
    # HTML again and result in a loop
    responses.add(responses.GET,
        'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
        status=200,
        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
        body="<html></html>")

    resp = ingest_worker.process(request)

    print(resp)
    assert resp['hit'] is False
    assert resp['status'] == "no-pdf-link"
    assert resp['request'] == request
    # no PDF was found, so GROBID must never have run
    assert 'grobid' not in resp
|