diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 18:51:08 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 18:51:08 -0700 | 
| commit | 5db075beaa55b2d619798154c06c2df625346972 (patch) | |
| tree | 38897e99a26b751d3e93b1a2f5308ea6fa05eabb /mapreduce/tests | |
| parent | 77577da13afe07b5177452122f4cee77e3357b4e (diff) | |
| download | sandcrawler-5db075beaa55b2d619798154c06c2df625346972.tar.gz sandcrawler-5db075beaa55b2d619798154c06c2df625346972.zip | |
progress on extractor
Diffstat (limited to 'mapreduce/tests')
| -rw-r--r-- | mapreduce/tests/test_extraction_cdx_grobid.py | 41 | 
1 file changed, 28 insertions, 13 deletions
| diff --git a/mapreduce/tests/test_extraction_cdx_grobid.py b/mapreduce/tests/test_extraction_cdx_grobid.py index 1d32c9f..46a89aa 100644 --- a/mapreduce/tests/test_extraction_cdx_grobid.py +++ b/mapreduce/tests/test_extraction_cdx_grobid.py @@ -1,13 +1,17 @@  import io  import json -import pytest  import mrjob +import pytest +import struct  import responses  import happybase_mock +from unittest import mock  from extraction_cdx_grobid import MRExtractCdxGrobid +FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) +  @pytest.fixture  def job():      """ @@ -22,13 +26,14 @@ def job():      job = MRExtractCdxGrobid(['--no-conf', '-'], hb_table=table)      return job - +@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))  @responses.activate -def test_mapper_lines(job): +def test_mapper_lines(mock_fetch, job): -    fake_grobid = {} -    responses.add(responses.POST, 'http://localhost:9070/api/processFulltextDocument', status=200, -        body=json.dumps(fake_grobid), content_type='application/json') +    with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'r') as f: +        real_tei_xml = f.read() +    responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200, +        body=real_tei_xml, content_type='application/json')      raw = io.BytesIO(b"""  com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz @@ -36,16 +41,23 @@ eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=76139301  com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robots.txt text/plain 200 6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD - - 638 398190140 
CITESEERX-CRAWL-2017-06-20-20170705062707827-00049-00058-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705063158203-00053-31209~wbgrp-svc284.us.archive.org~8443.warc.gz  """) -    job.sandbox(stdin=raw) +    output = io.BytesIO() +    job.sandbox(stdin=raw, stdout=output) -    pytest.skip("need to mock wayback fetch")      job.run_mapper() +    # for debugging tests +    #print(output.getvalue().decode('utf-8')) +    #print(list(job.hb_table.scan())) +      # wayback gets FETCH 1x times +    # TODO:      # grobid gets POST 3x times +    # TODO:      # hbase  +    # TODO:      assert job.hb_table.row(b'1') == {} @@ -58,15 +70,18 @@ com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robo      row = job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J') -    assert struct.unpack("", row[b'file:size']) == 12345 +    assert struct.unpack("!q", row[b'file:size'])[0] == len(FAKE_PDF_BYTES)      assert row[b'file:mime'] == b"application/pdf" -    assert struct.unpack("", row[b'grobid0:status_code']) == 200 -    assert row[b'grobid0:quality'] == None # TODO +    assert struct.unpack("!q", row[b'grobid0:status_code'])[0] == 200 +    # TODO: assert row[b'grobid0:quality'] == None      status = json.loads(row[b'grobid0:status'].decode('utf-8')) -    assert type(row[b'grobid0:status']) == type(dict()) -    assert row[b'grobid0:tei_xml'] == "<xml><lorem>ipsum</lorem></xml>" +    assert type(status) == type(dict()) +    assert row[b'grobid0:tei_xml'].decode('utf-8') == real_tei_xml      tei_json = json.loads(row[b'grobid0:tei_json'].decode('utf-8'))      metadata = json.loads(row[b'grobid0:metadata'].decode('utf-8')) +    assert tei_json['title'] == metadata['title'] +    assert 'body' in tei_json +    assert 'body' not in metadata  def test_parse_cdx_invalid(job): | 
