author    Bryan Newbold <bnewbold@archive.org>  2018-04-04 13:31:25 -0700
committer Bryan Newbold <bnewbold@archive.org>  2018-04-04 13:31:44 -0700
commit    7ecb1334506cab470399d9f493e5d8a651c9c2cc (patch)
tree      713d65e2e2e89d08f0dd93c052445002512c4769 /mapreduce
parent    1dad0d9e54bfae93eebea47f8a3cb291cdd645c5 (diff)
download  sandcrawler-7ecb1334506cab470399d9f493e5d8a651c9c2cc.tar.gz
          sandcrawler-7ecb1334506cab470399d9f493e5d8a651c9c2cc.zip
refactor out some common code
Diffstat (limited to 'mapreduce')
-rwxr-xr-x  mapreduce/backfill_hbase_from_cdx.py           | 98
-rw-r--r--  mapreduce/common.py                            | 71
-rwxr-xr-x  mapreduce/extraction_cdx_grobid.py             | 56
-rw-r--r--  mapreduce/tests/test_common.py                 | 30
-rw-r--r--  mapreduce/tests/test_extraction_cdx_grobid.py  | 62
5 files changed, 133 insertions(+), 184 deletions(-)
diff --git a/mapreduce/backfill_hbase_from_cdx.py b/mapreduce/backfill_hbase_from_cdx.py
index fe37bd5..8a28ec1 100755
--- a/mapreduce/backfill_hbase_from_cdx.py
+++ b/mapreduce/backfill_hbase_from_cdx.py
@@ -18,104 +18,10 @@ TODO:
 import sys
 import json
-from datetime import datetime
 
 import happybase
 import mrjob
 from mrjob.job import MRJob
-
-NORMAL_MIME = (
-    'application/pdf',
-    'application/postscript',
-    'text/html',
-    'text/xml',
-)
-
-def normalize_mime(raw):
-    raw = raw.lower()
-    for norm in NORMAL_MIME:
-        if raw.startswith(norm):
-            return norm
-
-    # Special cases
-    if raw.startswith('application/xml'):
-        return 'text/xml'
-    if raw.startswith('application/x-pdf'):
-        return 'application/pdf'
-    return None
-
-def test_normalize_mime():
-    assert normalize_mime("asdf") == None
-    assert normalize_mime("application/pdf") == "application/pdf"
-    assert normalize_mime("application/pdf+journal") == "application/pdf"
-    assert normalize_mime("Application/PDF") == "application/pdf"
-    assert normalize_mime("application/p") == None
-    assert normalize_mime("application/xml+stuff") == "text/xml"
-
-def transform_line(raw_cdx):
-
-    cdx = raw_cdx.split()
-    if len(cdx) < 11:
-        return None
-
-    surt = cdx[0]
-    dt = cdx[1]
-    url = cdx[2]
-    mime = normalize_mime(cdx[3])
-    http_status = cdx[4]
-    key = cdx[5]
-    c_size = cdx[8]
-    offset = cdx[9]
-    warc = cdx[10]
-
-    if not (key.isalnum() and c_size.isdigit() and offset.isdigit()
-            and http_status == "200" and len(key) == 32 and dt.isdigit()):
-        return None
-
-    if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
-        return None
-
-    key = "sha1:{}".format(key)
-
-    info = dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
-        offset=int(offset), warc=warc)
-
-    warc_file = warc.split('/')[-1]
-    dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
-    try:
-        dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
-    except:
-        return None
-
-    # 'i' intentionally not set
-    heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
-    return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
-
-def test_transform_line():
-
-    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
-    correct = {
-        'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
-        'file:mime': "application/pdf",
-        'file:cdx': {
-            'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
-            'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
-            'dt': "20170828233154",
-            'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
-            'offset': 931661233,
-            'c_size': 210251,
-        },
-        'f:c': {
-            'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
-            'd': "2017-08-28T23:31:54",
-            'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
-            'o': 931661233,
-            'c': 1,
-        }
-    }
-
-    assert transform_line(raw) == correct
-    assert transform_line(raw + "\n") == correct
-    assert transform_line(raw + " extra_field") == correct
+from common import parse_cdx_line
 
 
 class MRCDXBackfillHBase(MRJob):
@@ -171,7 +77,7 @@ class MRCDXBackfillHBase(MRJob):
             self.increment_counter('lines', 'invalid')
             return _, dict(status="invalid")
 
-        info = transform_line(raw_cdx)
+        info = parse_cdx_line(raw_cdx)
        if info is None:
             self.increment_counter('lines', 'invalid')
             return _, dict(status="invalid")
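
A note on the code removed here: transform_line() called datetime.strptime() once unguarded and then again inside the try block, so a malformed timestamp raised instead of returning None. The consolidated parse_cdx_line() in common.py (next diff) keeps only the guarded call. A minimal sketch of that guarded pattern, with a hypothetical helper name (to_iso is not in the repo):

    from datetime import datetime

    def to_iso(dt):
        # Return None instead of raising on a malformed 14-digit CDX timestamp
        try:
            return datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
        except ValueError:
            return None

    assert to_iso("20170828233154") == "2017-08-28T23:31:54"
    assert to_iso("20170705") is None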
diff --git a/mapreduce/common.py b/mapreduce/common.py
new file mode 100644
index 0000000..65b2744
--- /dev/null
+++ b/mapreduce/common.py
@@ -0,0 +1,71 @@
+
+from datetime import datetime
+
+NORMAL_MIME = (
+    'application/pdf',
+    'application/postscript',
+    'text/html',
+    'text/xml',
+)
+
+def normalize_mime(raw):
+    raw = raw.lower()
+    for norm in NORMAL_MIME:
+        if raw.startswith(norm):
+            return norm
+
+    # Special cases
+    if raw.startswith('application/xml'):
+        return 'text/xml'
+    if raw.startswith('application/x-pdf'):
+        return 'application/pdf'
+    return None
+
+
+def test_normalize_mime():
+    assert normalize_mime("asdf") == None
+    assert normalize_mime("application/pdf") == "application/pdf"
+    assert normalize_mime("application/pdf+journal") == "application/pdf"
+    assert normalize_mime("Application/PDF") == "application/pdf"
+    assert normalize_mime("application/p") == None
+    assert normalize_mime("application/xml+stuff") == "text/xml"
+
+
+def parse_cdx_line(raw_cdx):
+
+    cdx = raw_cdx.split()
+    if len(cdx) < 11:
+        return None
+
+    surt = cdx[0]
+    dt = cdx[1]
+    url = cdx[2]
+    mime = normalize_mime(cdx[3])
+    http_status = cdx[4]
+    key = cdx[5]
+    c_size = cdx[8]
+    offset = cdx[9]
+    warc = cdx[10]
+
+    if not (key.isalnum() and c_size.isdigit() and offset.isdigit()
+            and http_status == "200" and len(key) == 32 and dt.isdigit()
+            and mime != None):
+        return None
+
+    if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
+        return None
+
+    key = "sha1:{}".format(key)
+
+    info = dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
+        offset=int(offset), warc=warc)
+
+    warc_file = warc.split('/')[-1]
+    try:
+        dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
+    except:
+        return None
+
+    # 'i' intentionally not set
+    heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
+    return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
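
For orientation, this is the field layout parse_cdx_line() expects, plus a usage sketch. The column names follow the standard 11-field Wayback CDX format (an assumption, since the code itself only uses indices), and the sample line is the fixture from the tests below:

    # 0 urlkey (SURT)    1 timestamp (YYYYMMDDHHMMSS)   2 original URL
    # 3 mimetype         4 HTTP status                  5 SHA-1 digest (base32)
    # 6 redirect         7 robot/meta flags             (skipped by the parser)
    # 8 compressed size  9 WARC offset                 10 WARC path
    from common import parse_cdx_line

    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
    row = parse_cdx_line(raw)
    assert row['key'] == "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G"
    assert row['file:cdx']['offset'] == 931661233
    assert parse_cdx_line("a b c d") is None   # too few columns -> None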
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 27668ea..c102a59 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -24,49 +24,10 @@ from wayback.resource import Resource
 from wayback.resource import ArcResource
 from wayback.resourcestore import ResourceStore
 from gwb.loader import CDXLoaderFactory
+from common import parse_cdx_line
 
-def parse_cdx_line(raw_cdx):
-
-    cdx = raw_cdx.split()
-    if len(cdx) < 11:
-        return None
-
-    surt = cdx[0]
-    dt = cdx[1]
-    url = cdx[2]
-    mime = normalize_mime(cdx[3])
-    http_status = cdx[4]
-    key = cdx[5]
-    c_size = cdx[8]
-    offset = cdx[9]
-    warc = cdx[10]
-
-    if not (key.isalnum() and c_size.isdigit() and offset.isdigit()
-            and http_status == "200" and len(key) == 32 and dt.isdigit()):
-        return None
-
-    if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
-        return None
-
-    key = "sha1:{}".format(key)
-
-    info = dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
-        offset=int(offset), warc=warc)
-
-    warc_file = warc.split('/')[-1]
-    dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
-    try:
-        dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
-    except:
-        return None
-
-    # 'i' intentionally not set
-    heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
-    return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
-
-
-class MRExtractCdxGrobid(MrJob):
+class MRExtractCdxGrobid(MRJob):
 
     # CDX lines in; JSON status out
     INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
@@ -204,21 +165,24 @@ class MRExtractCdxGrobid(MrJob):
         info, status = self.parse_line(raw_cdx)
         if info is None:
             self.increment_counter('lines', status['status'])
-            return _, status
+            yield _, status
+            return
 
         # Check if we've already processed this line
-        oldrow = self.hb_table.get(info['key'], columns=['f', 'file',
+        oldrow = self.hb_table.row(info['key'], columns=['f', 'file',
             'grobid:status_code'])
-        if row.get('grobid0:status', None):
+        if oldrow.get('grobid0:status', None):
             # This file has already been processed; skip it
             self.increment_counter('lines', 'existing')
-            return _, dict(status="existing")
+            yield _, dict(status="existing")
+            return
 
         # Do the extraction
         info, status = self.extract(info)
         if info is None:
             self.increment_counter('lines', status['status'])
-            return _, status
+            yield _, status
+            return
 
         # Decide what to bother inserting back into HBase
         # Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
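
The return-to-yield changes above matter because mrjob consumes a mapper's output as an iterable of (key, value) pairs: "return _, status" hands back a bare 2-tuple that mrjob then tries to unpack as records, while yield followed by a bare return emits exactly one status record and stops. A stripped-down sketch of the corrected pattern (MRStatusDemo is hypothetical, not part of this repo):

    import mrjob
    from mrjob.job import MRJob

    class MRStatusDemo(MRJob):

        INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol

        def mapper(self, _, raw):
            if not raw.strip():
                yield _, dict(status="invalid")   # emit one status record...
                return                            # ...then bail out early
            yield _, dict(status="success")

    if __name__ == '__main__':
        MRStatusDemo.run()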
diff --git a/mapreduce/tests/test_common.py b/mapreduce/tests/test_common.py
new file mode 100644
index 0000000..e2f96bb
--- /dev/null
+++ b/mapreduce/tests/test_common.py
@@ -0,0 +1,30 @@
+
+from common import *
+
+
+def test_parse_cdx_line():
+
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    correct = {
+        'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+        'file:mime': "application/pdf",
+        'file:cdx': {
+            'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+            'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+            'dt': "20170828233154",
+            'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+            'offset': 931661233,
+            'c_size': 210251,
+        },
+        'f:c': {
+            'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+            'd': "2017-08-28T23:31:54",
+            'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+            'o': 931661233,
+            'c': 1,
+        }
+    }
+
+    assert parse_cdx_line(raw) == correct
+    assert parse_cdx_line(raw + "\n") == correct
+    assert parse_cdx_line(raw + " extra_field") == correct
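
One way to run just this new module, assuming pytest is invoked from the mapreduce/ directory so that common.py is importable (the repo's actual test-runner setup is not shown in this diff):

    import pytest
    pytest.main(["tests/test_common.py", "-v"])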
diff --git a/mapreduce/tests/test_extraction_cdx_grobid.py b/mapreduce/tests/test_extraction_cdx_grobid.py
index 71b55a3..514a830 100644
--- a/mapreduce/tests/test_extraction_cdx_grobid.py
+++ b/mapreduce/tests/test_extraction_cdx_grobid.py
@@ -5,36 +5,8 @@ import pytest
 import mrjob
 import responses
 import happybase_mock
-from extraction_cdx_grobid import MRExtractCDXGROBID
-
-
-
-def test_parse_cdx_line():
-
-    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
-    correct = {
-        'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
-        'file:mime': "application/pdf",
-        'file:cdx': {
-            'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
-            'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
-            'dt': "20170828233154",
-            'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
-            'offset': 931661233,
-            'c_size': 210251,
-        },
-        'f:c': {
-            'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
-            'd': "2017-08-28T23:31:54",
-            'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
-            'o': 931661233,
-            'c': 1,
-        }
-    }
-
-    assert transform_line(raw) == correct
-    assert transform_line(raw + "\n") == correct
-    assert transform_line(raw + " extra_field") == correct
+from extraction_cdx_grobid import MRExtractCdxGrobid
+
 
 @pytest.fixture
 def job():
@@ -47,7 +19,7 @@ def job():
         {'file': {}, 'grobid0': {}, 'f': {}})
     table = conn.table('wbgrp-journal-extract-test')
 
-    job = MRCDXBackfillHBase(['--no-conf', '-'], hb_table=table)
+    job = MRExtractCdxGrobid(['--no-conf', '-'], hb_table=table)
     return job
 
@@ -65,6 +37,8 @@ com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robo
 """)
     job.sandbox(stdin=raw)
+
+    pytest.skip("need to mock wayback fetch")
     job.run_mapper()
 
     # wayback gets FETCH 1x times
 
@@ -97,46 +71,47 @@ com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robo
 def test_parse_cdx_invalid(job):
 
     print("valid")
-    raw = io.BytesIO(b"com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
     info, status = job.parse_line(raw)
     assert status is None
 
     print("space-prefixed line")
-    raw = io.BytesIO(b" com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    raw = " com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
     info, status = job.parse_line(raw)
     assert info is None
     assert status['status'] == "invalid"
     assert 'prefix' in status['reason']
 
     print("commented line")
-    raw = io.BytesIO(b"#com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    raw = "#com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
     info, status = job.parse_line(raw)
     assert info is None
     assert status['status'] == "invalid"
     assert 'prefix' in status['reason']
 
     print("wrong column count")
-    raw = io.BytesIO(b"a b c d")
+    raw = "a b c d"
     info, status = job.parse_line(raw)
     assert info is None
     assert status['status'] == "invalid"
     assert 'parse' in status['reason']
 
     print("missing mimetype")
-    raw = io.BytesIO(b"com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf - 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
     info, status = job.parse_line(raw)
     assert info is None
+    print(status)
     assert status['status'] == "invalid"
     assert 'parse' in status['reason']
 
     print("HTTP status")
-    raw = io.BytesIO(b"com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 501 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 501 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
     info, status = job.parse_line(raw)
     assert info is None
     assert status['status'] == "invalid"
 
     print("datetime")
-    raw = io.BytesIO(b"com,sagepub,cep)/content/28/9/960.full.pdf 20170705 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 501 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 501 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
     info, status = job.parse_line(raw)
     assert info is None
     assert status['status'] == "invalid"
 
@@ -144,16 +119,19 @@ def test_parse_cdx_invalid(job):
 
 def test_parse_cdx_skip(job):
 
+    job.mapper_init()
+
+
     print("warc format")
-    raw = io.BytesIO(b"com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
-    info, status = job.mapper(raw)
+    raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    info, status = job.mapper(None, raw).__next__()
     assert info is None
     assert status['status'] == "skip"
     assert 'WARC' in status['reason']
 
     print("mimetype")
-    raw = io.BytesIO(b"com,sagepub,cep)/content/28/9/960.full.pdf 20170705 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
-    info, status = job.mapper(raw)
+    raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf text/html 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    info, status = job.mapper(None, raw).__next__()
     assert info is None
     assert status['status'] == "skip"
     assert 'mimetype' in status['reason']
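
Alongside the generator fixes, the mapper's HBase read was corrected from the nonexistent Table.get() to happybase's Table.row(), and the stale "row" name to "oldrow". A sketch of that read path against happybase_mock, mirroring the fixture above (the seeded row key and cell value here are hypothetical):

    import happybase_mock

    conn = happybase_mock.Connection()
    conn.create_table('wbgrp-journal-extract-test',
        {'file': {}, 'grobid0': {}, 'f': {}})
    table = conn.table('wbgrp-journal-extract-test')

    # Seed a row that looks already-processed
    table.put(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ',
              {b'grobid0:status': b'{"status": "existing"}'})

    # happybase reads a single row with Table.row(), not Table.get()
    oldrow = table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ')
    assert oldrow.get(b'grobid0:status') is not None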