diff options
Diffstat (limited to 'python/tests')
-rw-r--r-- | python/tests/test_grobid.py | 27 | ||||
-rw-r--r-- | python/tests/test_grobid2json.py | 6 | ||||
-rw-r--r-- | python/tests/test_html.py | 10 | ||||
-rw-r--r-- | python/tests/test_html_ingest.py | 1 | ||||
-rw-r--r-- | python/tests/test_html_metadata.py | 32 | ||||
-rw-r--r-- | python/tests/test_ingest.py | 119 | ||||
-rw-r--r-- | python/tests/test_live_wayback.py | 23 | ||||
-rw-r--r-- | python/tests/test_misc.py | 22 | ||||
-rw-r--r-- | python/tests/test_pdfextract.py | 12 | ||||
-rw-r--r-- | python/tests/test_pushers.py | 7 | ||||
-rw-r--r-- | python/tests/test_savepagenow.py | 126 | ||||
-rw-r--r-- | python/tests/test_wayback.py | 100 | ||||
-rw-r--r-- | python/tests/test_xml.py | 3 |
13 files changed, 294 insertions, 194 deletions
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 7d950df..55636dc 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -1,4 +1,3 @@ - import struct import pytest @@ -12,20 +11,21 @@ FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f: REAL_TEI_XML = f.read() + @pytest.fixture def grobid_client(): - client = GrobidClient( - host_url="http://dummy-grobid", - ) + client = GrobidClient(host_url="http://dummy-grobid", ) return client + @responses.activate def test_grobid_503(grobid_client): status = b'{"status": "done broke due to 503"}' responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=503, - body=status) + 'http://dummy-grobid/api/processFulltextDocument', + status=503, + body=status) resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) @@ -35,12 +35,15 @@ def test_grobid_503(grobid_client): assert resp['status_code'] == 503 assert resp['status'] == "error" + @responses.activate def test_grobid_success(grobid_client): responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=200, - body=REAL_TEI_XML, content_type='text/xml') + 'http://dummy-grobid/api/processFulltextDocument', + status=200, + body=REAL_TEI_XML, + content_type='text/xml') resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) @@ -53,6 +56,7 @@ def test_grobid_success(grobid_client): #print(type(REAL_TEI_XML)) assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1') + @responses.activate def test_grobid_worker_cdx(grobid_client, wayback_client): @@ -60,8 +64,10 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): worker = GrobidWorker(grobid_client, wayback_client, sink=sink) responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=200, - body=REAL_TEI_XML, content_type='text/xml') + 'http://dummy-grobid/api/processFulltextDocument', + status=200, + body=REAL_TEI_XML, + content_type='text/xml') with open('tests/files/example.cdx', 'r') as cdx_file: pusher = CdxLinePusher( @@ -76,4 +82,3 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): assert pusher_counts['pushed'] == worker.counts['total'] assert len(responses.calls) == worker.counts['total'] - diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py index b8999b1..7637871 100644 --- a/python/tests/test_grobid2json.py +++ b/python/tests/test_grobid2json.py @@ -1,4 +1,3 @@ - import json import xml @@ -8,14 +7,15 @@ from grobid2json import * def test_small_xml(): - + with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() with open('tests/files/small.json', 'r') as f: - json_form = json.loads(f.read()) + json_form = json.loads(f.read()) assert teixml2json(tei_xml) == json_form + def test_invalid_xml(): with pytest.raises(xml.etree.ElementTree.ParseError): diff --git a/python/tests/test_html.py b/python/tests/test_html.py index d4bffc1..c5f422e 100644 --- a/python/tests/test_html.py +++ b/python/tests/test_html.py @@ -1,4 +1,3 @@ - import json import pytest @@ -13,8 +12,7 @@ def test_extract_fulltext_url(): assert resp == {} resp = extract_fulltext_url( - "http://dummy-site/", - b"""<html> + "http://dummy-site/", b"""<html> <head> <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf"> </head> @@ -22,8 +20,7 @@ def test_extract_fulltext_url(): <h1>my big article here</h1> blah </body> - </html>""" - ) + </html>""") assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf" assert resp['technique'] == "citation_pdf_url" @@ -32,4 +29,5 @@ def test_extract_fulltext_url(): "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978", f.read(), ) - assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + assert resp[ + 'pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index 943e5da..3bf94e2 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -1,4 +1,3 @@ - import datetime import pytest diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py index 7f35d55..a4c1e41 100644 --- a/python/tests/test_html_metadata.py +++ b/python/tests/test_html_metadata.py @@ -1,4 +1,3 @@ - import datetime import pytest @@ -44,11 +43,12 @@ def test_html_metadata_plos() -> None: def test_html_metadata_elife() -> None: - + with open('tests/files/elife_article.html', 'r') as f: elife_html = f.read() - meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html)) + meta = html_extract_biblio("https://elifesciences.org/articles/44753", + HTMLParser(elife_html)) assert meta is not None assert meta.title == "Parallel visual circuitry in a basal chordate" assert meta.doi == "10.7554/eLife.44753" @@ -69,7 +69,7 @@ def test_html_metadata_elife() -> None: def test_html_metadata_peerj() -> None: - + with open('tests/files/peerj_oa_article.html', 'r') as f: peerj_html = f.read() @@ -78,15 +78,15 @@ def test_html_metadata_peerj() -> None: assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles" assert meta.doi == "10.7717/peerj.4375" assert meta.contrib_names == [ - "Heather Piwowar", - "Jason Priem", - "Vincent Larivière", - "Juan Pablo Alperin", - "Lisa Matthias", - "Bree Norlander", - "Ashley Farley", - "Jevin West", - "Stefanie Haustein", + "Heather Piwowar", + "Jason Priem", + "Vincent Larivière", + "Juan Pablo Alperin", + "Lisa Matthias", + "Bree Norlander", + "Ashley Farley", + "Jevin West", + "Stefanie Haustein", ] assert meta.container_name == "PeerJ" # "2018-02-13" @@ -129,7 +129,7 @@ def test_html_metadata_ojs3() -> None: "Os Keyes", ] assert meta.container_name == "First Monday" - assert meta.container_abbrev == "1" # NOTE: bad source metadata + assert meta.container_abbrev == "1" # NOTE: bad source metadata assert meta.container_issn == "1396-0466" # "2020/09/10" assert meta.release_date == datetime.date(year=2020, month=9, day=10) @@ -150,6 +150,7 @@ def test_html_metadata_dlib() -> None: # "2017-05-15" assert meta.release_date == datetime.date(year=2017, month=5, day=15) + def test_html_metadata_dc_case() -> None: """ This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive. @@ -167,10 +168,12 @@ def test_html_metadata_dc_case() -> None: assert meta is not None assert meta.issue == "123" + @pytest.fixture def adblock() -> Any: return load_adblock_rules() + def test_html_resources(adblock) -> None: with open('tests/files/dlib_05vanhyning.html', 'r') as f: @@ -227,4 +230,3 @@ def test_html_resources(adblock) -> None: HTMLParser(nature_html), adblock, ) - diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 0965fcb..79f50f4 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -1,4 +1,3 @@ - import json import pytest @@ -12,9 +11,7 @@ from sandcrawler import * @pytest.fixture def ingest_worker(wayback_client, spn_client): - grobid_client = GrobidClient( - host_url="http://dummy-grobid", - ) + grobid_client = GrobidClient(host_url="http://dummy-grobid", ) worker = IngestFileWorker( wayback_client=wayback_client, spn_client=spn_client, @@ -22,14 +19,11 @@ def ingest_worker(wayback_client, spn_client): ) return worker + @pytest.fixture def ingest_worker_pdf(wayback_client_pdf, spn_client): - grobid_client = GrobidClient( - host_url="http://dummy-grobid", - ) - pgrest_client = SandcrawlerPostgrestClient( - api_url="http://dummy-postgrest", - ) + grobid_client = GrobidClient(host_url="http://dummy-grobid", ) + pgrest_client = SandcrawlerPostgrestClient(api_url="http://dummy-postgrest", ) worker = IngestFileWorker( wayback_client=wayback_client_pdf, spn_client=spn_client, @@ -50,37 +44,45 @@ def test_ingest_success(ingest_worker_pdf): 'base_url': "http://dummy-host/", } responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({ + "url": TARGET, + "job_id": JOB_ID + })) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SPN_HIT)) responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=pdf_bytes) + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", + TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=pdf_bytes) responses.add(responses.GET, - 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"), - status=200, - body=json.dumps([])) + 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format( + "90ffd2359008d82298821d16b21778c5c39aec36"), + status=200, + body=json.dumps([])) responses.add(responses.GET, - 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"), - status=200, - body=json.dumps([])) + 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format( + "90ffd2359008d82298821d16b21778c5c39aec36"), + status=200, + body=json.dumps([])) responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', status=200, - body=REAL_TEI_XML, content_type='text/xml') + 'http://dummy-grobid/api/processFulltextDocument', + status=200, + body=REAL_TEI_XML, + content_type='text/xml') resp = ingest_worker_pdf.process(request) @@ -108,6 +110,7 @@ def test_ingest_success(ingest_worker_pdf): assert resp['pdf_meta']['pdf_extra']['page_count'] == 1 assert resp['pdf_meta'].get('text') is None + @responses.activate def test_ingest_landing(ingest_worker): @@ -116,34 +119,39 @@ def test_ingest_landing(ingest_worker): 'base_url': "http://dummy-host/", } responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({ + "url": TARGET, + "job_id": JOB_ID + })) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SPN_HIT)) responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=WARC_BODY) + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", + TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=WARC_BODY) # this is for second time around; don't want to fetch same landing page # HTML again and result in a loop responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body="<html></html>") + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", + TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body="<html></html>") resp = ingest_worker.process(request) @@ -157,6 +165,7 @@ def test_ingest_landing(ingest_worker): assert 'revisit_cdx' not in resp assert 'grobid' not in resp + @responses.activate def test_ingest_blocklist(ingest_worker): @@ -192,6 +201,7 @@ def test_ingest_wall_blocklist(ingest_worker): assert resp['status'] == "skip-wall" assert resp['request'] == request + @responses.activate def test_ingest_cookie_blocklist(ingest_worker): @@ -205,4 +215,3 @@ def test_ingest_cookie_blocklist(ingest_worker): assert resp['hit'] == False assert resp['status'] == "blocked-cookie" assert resp['request'] == request - diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index b501dc3..0ff4902 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -1,4 +1,3 @@ - """ This file contains tests to run against "live" wayback services. They default to "skip" because you need authentication, and we shouldn't hit these services @@ -11,8 +10,8 @@ import json import pytest -from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, SavePageNowError, - WaybackClient, WaybackError, gen_file_metadata) +from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, + SavePageNowError, WaybackClient, WaybackError, gen_file_metadata) @pytest.fixture @@ -20,16 +19,19 @@ def cdx_client(): client = CdxApiClient() return client + @pytest.fixture def wayback_client(): client = WaybackClient() return client + @pytest.fixture def spn_client(): client = SavePageNowClient() return client + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_fetch(cdx_client): @@ -50,6 +52,7 @@ def test_cdx_fetch(cdx_client): with pytest.raises(KeyError): resp = cdx_client.fetch(url, "12345678123456") + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_lookup_best(cdx_client): @@ -68,13 +71,18 @@ def test_cdx_lookup_best(cdx_client): assert resp.mimetype == "text/html" assert resp.status_code == 200 + @pytest.mark.skip(reason="hits prod services, requires auth") def test_wayback_fetch(wayback_client): - resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz") + resp = wayback_client.fetch_petabox( + 25683, 2676464871, + "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz" + ) assert resp.body + @pytest.mark.skip(reason="hits prod services, requires auth") def test_lookup_resource_success(wayback_client): @@ -86,6 +94,7 @@ def test_lookup_resource_success(wayback_client): assert resp.terminal_url in (url, url.replace("https://", "http://")) assert resp.cdx.url in (url, url.replace("https://", "http://")) + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_fetch_spn2(cdx_client): @@ -107,8 +116,8 @@ def test_cdx_fetch_spn2(cdx_client): # https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410 #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz -#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz -#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz + #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz + #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209" datetime = "20200110222410" @@ -119,6 +128,7 @@ def test_cdx_fetch_spn2(cdx_client): assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL" assert resp.status_code == 200 + @pytest.mark.skip(reason="hits prod services, requires auth") def test_lookup_ftp(wayback_client): # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf @@ -153,6 +163,7 @@ def test_lookup_ftp(wayback_client): file_meta = gen_file_metadata(resp.body) assert file_meta['sha1hex'] == resp.cdx.sha1hex + @pytest.mark.skip(reason="hits prod services, requires auth") def test_crawl_ftp(spn_client, wayback_client): diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 0788c38..dcc1202 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -1,11 +1,11 @@ - import pytest -from sandcrawler import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_line +from sandcrawler import (b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, + parse_cdx_line) def test_gen_file_metadata(): - + # valid (but very small) PDF file with open('tests/files/dummy.pdf', 'rb') as f: file_meta = gen_file_metadata(f.read()) @@ -27,8 +27,9 @@ def test_gen_file_metadata(): assert fm['mimetype'] == 'text/plain' assert fm['size_bytes'] == 8 + def test_gen_file_metadata_path(): - + # valid (but very small) PDF file file_meta = gen_file_metadata_path('tests/files/dummy.pdf') assert file_meta == { @@ -39,11 +40,14 @@ def test_gen_file_metadata_path(): 'size_bytes': 13264, } + def test_b32_hex(): # valid b32 - assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' - assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' + assert b32_hex( + 'sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' + assert b32_hex( + 'TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' # sha1hex pass-through s = 'bda3c1017d52e826bbd1da51efad877272d300f9' @@ -53,6 +57,7 @@ def test_b32_hex(): with pytest.raises(ValueError): assert b32_hex('blah') == 'blah' + def test_parse_cdx_line(): raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" @@ -73,6 +78,7 @@ def test_parse_cdx_line(): assert parse_cdx_line(raw + "\n") == correct assert parse_cdx_line(raw + " extra_field") == correct + def test_invalid_cdx(): print("missing warc") @@ -80,11 +86,11 @@ def test_invalid_cdx(): assert parse_cdx_line(raw) == None print("bad datetime") - raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" + raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" assert parse_cdx_line(raw) == None + def test_clean_url(): assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf" assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \ "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view" - diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index 1d334d6..146b138 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -1,4 +1,3 @@ - import struct import poppler @@ -6,11 +5,13 @@ import pytest import responses from test_wayback import cdx_client, wayback_client -from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, WaybackClient +from sandcrawler import (BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, + WaybackClient) from sandcrawler.pdfextract import process_pdf FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) + def test_process_fake_pdf(): resp = process_pdf(FAKE_PDF_BYTES) print(resp) @@ -21,7 +22,9 @@ def test_process_fake_pdf(): resp = process_pdf(pdf_bytes) assert resp.status == 'not-pdf' -@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler") + +@pytest.mark.skipif(poppler.version_string() == '0.71.0', + reason="unsupported version of poppler") def test_process_dummy_pdf(): with open('tests/files/dummy.pdf', 'rb') as f: pdf_bytes = f.read() @@ -39,6 +42,7 @@ def test_process_dummy_pdf(): assert resp.pdf_extra['page0_width'] == 595 assert resp.pdf_extra['page_count'] == 1 + def test_pdfextract_worker_cdx(wayback_client): sink = BlackholeSink() @@ -56,6 +60,7 @@ def test_pdfextract_worker_cdx(wayback_client): assert pusher_counts['pushed'] == 7 assert pusher_counts['pushed'] == worker.counts['total'] + def test_pdfextract_blob_worker(): sink = BlackholeSink() @@ -65,4 +70,3 @@ def test_pdfextract_blob_worker(): pdf_bytes = f.read() worker.process(pdf_bytes) - diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py index 62fa515..63f90d3 100644 --- a/python/tests/test_pushers.py +++ b/python/tests/test_pushers.py @@ -1,4 +1,3 @@ - import pytest from sandcrawler.workers import BlackholeSink, CdxLinePusher @@ -18,8 +17,10 @@ def test_cdx_line_pusher(): # HTTP 200 and application/pdf with open('tests/files/example.cdx', 'r') as cdx_file: - pusher = CdxLinePusher(sink, cdx_file, - filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226]) + pusher = CdxLinePusher(sink, + cdx_file, + filter_mimetypes=['application/pdf'], + filter_http_statuses=[200, 226]) counts = pusher.run() assert counts['total'] == 20 assert counts['skip-parse'] == 1 diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py index f3fbfda..80334d9 100644 --- a/python/tests/test_savepagenow.py +++ b/python/tests/test_savepagenow.py @@ -1,4 +1,3 @@ - import json import pytest @@ -26,9 +25,7 @@ SUCCESS_BODY = { "timestamp": "20180326070330", "duration_sec": 6.203, "resources": [ - TARGET, - TARGET + "/redirect", - "http://brewster.kahle.org/", + TARGET, TARGET + "/redirect", "http://brewster.kahle.org/", "http://brewster.kahle.org/favicon.ico", "http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg", "http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg", @@ -43,8 +40,7 @@ SUCCESS_BODY = { "http://brewster.kahle.org/wp-content/themes/twentyten/style.css", "http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4", "http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4", - "http://platform.twitter.com/widgets.js", - "https://archive-it.org/piwik.js", + "http://platform.twitter.com/widgets.js", "https://archive-it.org/piwik.js", "https://platform.twitter.com/jot.html", "https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js", "https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html", @@ -60,7 +56,7 @@ SUCCESS_BODY = { "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4", "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4" ], - "outlinks":{ + "outlinks": { "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695", "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695" } @@ -74,10 +70,18 @@ ERROR_BODY = { "resources": [] } CDX_SPN_HIT = [ - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], - ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"], + [ + "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect", + "robotflags", "length", "offset", "filename" + ], + [ + "wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", + CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", + "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz" + ], ] + @pytest.fixture def spn_client(): client = SavePageNowClient( @@ -88,25 +92,29 @@ def spn_client(): client.poll_seconds = 0.0 return client + @responses.activate def test_savepagenow_success(spn_client): responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({ + "url": TARGET, + "job_id": JOB_ID + })) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) resp = spn_client.save_url_now_v2(TARGET) @@ -119,21 +127,25 @@ def test_savepagenow_success(spn_client): assert resp.terminal_dt == SUCCESS_BODY['timestamp'] assert resp.resources == SUCCESS_BODY['resources'] + @responses.activate def test_savepagenow_remote_error(spn_client): responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({ + "url": TARGET, + "job_id": JOB_ID + })) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(ERROR_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(ERROR_BODY)) resp = spn_client.save_url_now_v2(TARGET) @@ -146,47 +158,56 @@ def test_savepagenow_remote_error(spn_client): assert resp.terminal_dt == None assert resp.resources == None + @responses.activate def test_savepagenow_500(spn_client): responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({ + "url": TARGET, + "job_id": JOB_ID + })) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=500, - body=json.dumps(ERROR_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=500, + body=json.dumps(ERROR_BODY)) with pytest.raises(SavePageNowError): resp = spn_client.save_url_now_v2(TARGET) assert len(responses.calls) == 2 + @responses.activate def test_crawl_resource(spn_client, wayback_client): responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({"url": TARGET, "job_id": JOB_ID})) + 'http://dummy-spnv2/save', + status=200, + body=json.dumps({ + "url": TARGET, + "job_id": JOB_ID + })) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY)) responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) + 'http://dummy-spnv2/save/status/' + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY)) responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SPN_HIT)) responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=WARC_BODY) + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", + TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=WARC_BODY) print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect")) resp = spn_client.crawl_resource(TARGET, wayback_client) @@ -201,4 +222,3 @@ def test_crawl_resource(spn_client, wayback_client): assert type(resp.cdx) == CdxPartial with pytest.raises(AttributeError): print(resp.cdx.warc_path) - diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index 83311b9..6ccf775 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -1,4 +1,3 @@ - import json import pytest @@ -10,27 +9,66 @@ CDX_TARGET = "http://fatcat.wiki/" CDX_DT = "20180812220054" # cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/ CDX_SINGLE_HIT = [ - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], - ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + [ + "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect", + "robotflags", "length", "offset", "filename" + ], + [ + "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + ], ] CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR" # cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/ CDX_MULTI_HIT = [ - ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], - ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], - # sooner, but not right mimetype - ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], - # sooner and mimetype, but wrong status code - ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], - ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], - ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], - # "best" - ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], - # older - ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + [ + "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect", + "robotflags", "length", "offset", "filename" + ], + [ + "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + ], + # sooner, but not right mimetype + [ + "wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + ], + # sooner and mimetype, but wrong status code + [ + "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + ], + [ + "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + ], + [ + "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + ], + # "best" + [ + "wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", + "-", "8445", "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + ], + # older + [ + "wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + ], ] + @pytest.fixture def cdx_client(): client = CdxApiClient( @@ -39,13 +77,14 @@ def cdx_client(): ) return client + @responses.activate def test_cdx_fetch(cdx_client): responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SINGLE_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SINGLE_HIT)) resp = cdx_client.fetch(CDX_TARGET, CDX_DT) @@ -58,6 +97,7 @@ def test_cdx_fetch(cdx_client): assert resp.warc_offset == 108062304 assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + @responses.activate def test_cdx_fetch_errors(cdx_client): @@ -65,9 +105,9 @@ def test_cdx_fetch_errors(cdx_client): resp = cdx_client.fetch(CDX_TARGET, "2019") responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SINGLE_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SINGLE_HIT)) with pytest.raises(KeyError): resp = cdx_client.fetch(CDX_TARGET, "20180812220055") @@ -78,13 +118,14 @@ def test_cdx_fetch_errors(cdx_client): resp = cdx_client.fetch(CDX_TARGET, CDX_DT) assert len(responses.calls) == 3 + @responses.activate def test_cdx_lookup_best(cdx_client): responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_MULTI_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_MULTI_HIT)) resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf") @@ -95,6 +136,7 @@ def test_cdx_lookup_best(cdx_client): assert resp.sha1b32 == CDX_BEST_SHA1B32 assert resp.warc_path == CDX_SINGLE_HIT[1][-1] + WARC_TARGET = "http://fatcat.wiki/" WARC_BODY = b""" <html> @@ -108,6 +150,7 @@ WARC_BODY = b""" </html> """ + @pytest.fixture def wayback_client(cdx_client, mocker): client = WaybackClient( @@ -127,6 +170,7 @@ def wayback_client(cdx_client, mocker): return client + @pytest.fixture def wayback_client_pdf(cdx_client, mocker): @@ -150,6 +194,7 @@ def wayback_client_pdf(cdx_client, mocker): return client + @responses.activate def test_wayback_fetch(wayback_client): resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz") @@ -159,13 +204,14 @@ def test_wayback_fetch(wayback_client): resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz") assert resp == WARC_BODY + @responses.activate def test_lookup_resource_success(wayback_client): responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_MULTI_HIT)) + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_MULTI_HIT)) resp = wayback_client.lookup_resource(CDX_TARGET) diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py index a996c56..1742f3a 100644 --- a/python/tests/test_xml.py +++ b/python/tests/test_xml.py @@ -1,11 +1,10 @@ - import pytest from sandcrawler.xml import xml_reserialize def test_xml_reserialize() -> None: - + with open('tests/files/scielo_article.jats.xml', 'rb') as f: raw_xml = f.read() |