aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rwxr-xr-xpython/extraction_cdx_grobid.py5
-rwxr-xr-xpython/extraction_ungrobided.py5
-rwxr-xr-xpython/kafka_grobid.py1
-rw-r--r--python/tests/test_extraction_cdx_grobid.py2
4 files changed, 8 insertions, 5 deletions
diff --git a/python/extraction_cdx_grobid.py b/python/extraction_cdx_grobid.py
index 01d566e..88580e1 100755
--- a/python/extraction_cdx_grobid.py
+++ b/python/extraction_cdx_grobid.py
@@ -17,6 +17,7 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import os
import xml
import json
import raven
@@ -119,11 +120,11 @@ class MRExtractCdxGrobid(MRJob):
return info, None
def fetch_warc_content(self, warc_path, offset, c_size):
- warc_uri = self.warc_uri_prefix + warc_path
+ warc_uri = self.options.warc_uri_prefix + warc_path
if not self.rstore:
self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ download_base_url=self.options.warc_uri_prefix))
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index 99d4f13..225e46f 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -17,6 +17,7 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import os
import xml
import json
import raven
@@ -122,11 +123,11 @@ class MRExtractUnGrobided(MRJob):
return info, None
def fetch_warc_content(self, warc_path, offset, c_size):
- warc_uri = self.warc_uri_prefix + warc_path
+ warc_uri = self.options.warc_uri_prefix + warc_path
if not self.rstore:
self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ download_base_url=self.options.warc_uri_prefix))
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
diff --git a/python/kafka_grobid.py b/python/kafka_grobid.py
index ba84eee..cde7a2d 100755
--- a/python/kafka_grobid.py
+++ b/python/kafka_grobid.py
@@ -32,6 +32,7 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import os
import sys
import xml
import json
diff --git a/python/tests/test_extraction_cdx_grobid.py b/python/tests/test_extraction_cdx_grobid.py
index 1bf2420..471d94a 100644
--- a/python/tests/test_extraction_cdx_grobid.py
+++ b/python/tests/test_extraction_cdx_grobid.py
@@ -8,7 +8,7 @@ import responses
import happybase_mock
import wayback.exception
from unittest import mock
-from extraction_cdx_grobid import MRExtractCdxGrobid, Resource
+from extraction_cdx_grobid import MRExtractCdxGrobid
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)