From 0fa705d4e25e329bd976c399acb58cb95d0d424f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 Feb 2019 12:25:30 -0800 Subject: python test fixes --- python/extraction_cdx_grobid.py | 5 +++-- python/extraction_ungrobided.py | 5 +++-- python/kafka_grobid.py | 1 + python/tests/test_extraction_cdx_grobid.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'python') diff --git a/python/extraction_cdx_grobid.py b/python/extraction_cdx_grobid.py index 01d566e..88580e1 100755 --- a/python/extraction_cdx_grobid.py +++ b/python/extraction_cdx_grobid.py @@ -17,6 +17,7 @@ Requires: # in `wayback` library. Means we can't run pylint. # pylint: skip-file +import os import xml import json import raven @@ -119,11 +120,11 @@ class MRExtractCdxGrobid(MRJob): return info, None def fetch_warc_content(self, warc_path, offset, c_size): - warc_uri = self.warc_uri_prefix + warc_path + warc_uri = self.options.warc_uri_prefix + warc_path if not self.rstore: self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + download_base_url=self.options.warc_uri_prefix)) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py index 99d4f13..225e46f 100755 --- a/python/extraction_ungrobided.py +++ b/python/extraction_ungrobided.py @@ -17,6 +17,7 @@ Requires: # in `wayback` library. Means we can't run pylint. # pylint: skip-file +import os import xml import json import raven @@ -122,11 +123,11 @@ class MRExtractUnGrobided(MRJob): return info, None def fetch_warc_content(self, warc_path, offset, c_size): - warc_uri = self.warc_uri_prefix + warc_path + warc_uri = self.options.warc_uri_prefix + warc_path if not self.rstore: self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + download_base_url=self.options.warc_uri_prefix)) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: diff --git a/python/kafka_grobid.py b/python/kafka_grobid.py index ba84eee..cde7a2d 100755 --- a/python/kafka_grobid.py +++ b/python/kafka_grobid.py @@ -32,6 +32,7 @@ Requires: # in `wayback` library. Means we can't run pylint. # pylint: skip-file +import os import sys import xml import json diff --git a/python/tests/test_extraction_cdx_grobid.py b/python/tests/test_extraction_cdx_grobid.py index 1bf2420..471d94a 100644 --- a/python/tests/test_extraction_cdx_grobid.py +++ b/python/tests/test_extraction_cdx_grobid.py @@ -8,7 +8,7 @@ import responses import happybase_mock import wayback.exception from unittest import mock -from extraction_cdx_grobid import MRExtractCdxGrobid, Resource +from extraction_cdx_grobid import MRExtractCdxGrobid FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) -- cgit v1.2.3