diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-01-08 19:30:38 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-09 16:31:40 -0800 | 
| commit | 6454cdc93424b23f484fc56e1f9f986490d05c2b (patch) | |
| tree | 14e76469fdf9dfead292db1b26bd90b475feaf4e /python/tests | |
| parent | 318bcf9dbc244a1130b74252b7842cc4eb954bfd (diff) | |
| download | sandcrawler-6454cdc93424b23f484fc56e1f9f986490d05c2b.tar.gz sandcrawler-6454cdc93424b23f484fc56e1f9f986490d05c2b.zip | |
wrap up basic (locally testable) ingest refactor
Diffstat (limited to 'python/tests')
| -rw-r--r-- | python/tests/test_wayback.py | 52 | 
1 file changed, 48 insertions, 4 deletions
| diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index 8d15d70..2aafe7c 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -54,8 +54,8 @@ def test_cdx_fetch(cdx_client):      assert resp.datetime == CDX_DT      assert resp.url == CDX_TARGET      assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR" -    assert resp.warc_csize == "8445" -    assert resp.warc_offset == "108062304" +    assert resp.warc_csize == 8445 +    assert resp.warc_offset == 108062304      assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"  @responses.activate @@ -96,7 +96,17 @@ def test_cdx_lookup_best(cdx_client):      assert resp.warc_path == CDX_SINGLE_HIT[1][-1]  WARC_TARGET = "http://fatcat.wiki/" -WARC_BODY = b"<html>some stuff</html>" +WARC_BODY = b""" +<html> +  <head> +      <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf"> +  </head> +  <body> +    <h1>my big article here</h1> +    blah +  </body> +</html> +"""  @pytest.fixture  def wayback_client(cdx_client, mocker): @@ -116,7 +126,30 @@ def wayback_client(cdx_client, mocker):      return client -def test_wayback_fetch(wayback_client, mocker): +@pytest.fixture +def wayback_client_pdf(cdx_client, mocker): + +    with open('tests/files/dummy.pdf', 'rb') as f: +        pdf_bytes = f.read() + +    client = WaybackClient( +        cdx_client=cdx_client, +        petabox_webdata_secret="dummy-petabox-secret", +    ) +    # mock out the wayback store with mock stuff +    client.rstore = mocker.Mock() +    resource = mocker.Mock() +    client.rstore.load_resource = mocker.MagicMock(return_value=resource) +    resource.get_status = mocker.MagicMock(return_value=[200]) +    resource.get_location = mocker.MagicMock(return_value=[WARC_TARGET]) +    body = mocker.Mock() +    resource.open_raw_content = mocker.MagicMock(return_value=body) +    body.read = mocker.MagicMock(return_value=pdf_bytes) + +    
return client + +@responses.activate +def test_wayback_fetch(wayback_client):      resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")      assert resp.body == WARC_BODY      assert resp.location == WARC_TARGET @@ -124,3 +157,14 @@ def test_wayback_fetch(wayback_client, mocker):      resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")      assert resp == WARC_BODY +@responses.activate +def test_lookup_resource_success(wayback_client): + +    responses.add(responses.GET, +        'http://dummy-cdx/cdx', +        status=200, +        body=json.dumps(CDX_MULTI_HIT)) + +    resp = wayback_client.lookup_resource(CDX_TARGET) + +    assert resp.hit == True | 
