diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-02-21 11:55:16 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-02-21 11:55:20 -0800 |
commit | 4225fe89836b72e771e612139d0f5561088a6909 (patch) | |
tree | 3a6a5a24735e1b4212419bdca01a96392be5130e /python/kafka_grobid.py | |
parent | f59e9895d3c9d198538b40e36d3b0cc3b4bb5b92 (diff) | |
download | sandcrawler-4225fe89836b72e771e612139d0f5561088a6909.tar.gz sandcrawler-4225fe89836b72e771e612139d0f5561088a6909.zip |
backport GWB fetch improvements to extraction/kafka workers
*Really* need to refactor out these common methods into a base class.
Diffstat (limited to 'python/kafka_grobid.py')
-rwxr-xr-x | python/kafka_grobid.py | 12 |
1 files changed, 8 insertions, 4 deletions
```diff
diff --git a/python/kafka_grobid.py b/python/kafka_grobid.py
index 17908e5..ba84eee 100755
--- a/python/kafka_grobid.py
+++ b/python/kafka_grobid.py
@@ -37,13 +37,11 @@
 import xml
 import json
 import raven
 import struct
-import requests
 import argparse
+import requests
 import pykafka
 import wayback.exception
 from http.client import IncompleteRead
-from wayback.resource import Resource
-from wayback.resource import ArcResource
 from wayback.resourcestore import ResourceStore
 from gwb.loader import CDXLoaderFactory
@@ -66,6 +64,10 @@ class KafkaGrobidWorker:
         self.consumer_group = kwargs.get('consumer_group', 'grobid-extraction')
         self.kafka_hosts = kafka_hosts or 'localhost:9092'
         self.grobid_uri = kwargs.get('grobid_uri')
+        # /serve/ instead of /download/ doesn't record view count
+        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+        # gwb library will fall back to reading from /opt/.petabox/webdata.secret
+        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
         self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
         self.mime_filter = ['application/pdf']
         self.rstore = None
@@ -104,7 +106,9 @@ class KafkaGrobidWorker:
     def fetch_warc_content(self, warc_path, offset, c_size):
         warc_uri = self.warc_uri_prefix + warc_path
         if not self.rstore:
-            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
+            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+                webdata_secret=self.petabox_webdata_secret,
+                download_base_url=self.petabox_base_url))
         try:
             gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
         except wayback.exception.ResourceUnavailable:
```