author | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 16:42:50 -0800
committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 16:42:50 -0800
commit | e74bf1ea577c8d991eebf4174ae1a1932ad9992f
tree | e8edfe82d702fc93daa864aee127c81f3d6d3336
parent | e456e6a17eb9655afc4dc8146f50a7dba4fd8601
rename FileIngestWorker
-rwxr-xr-x | python/ingest_file.py | 11
-rw-r--r-- | python/sandcrawler/__init__.py | 1
-rw-r--r-- | python/sandcrawler/ingest.py | 14
3 files changed, 16 insertions, 10 deletions
```diff
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 4fd44ca..5a20aac 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -5,7 +5,7 @@ import json
 import argparse
 from http.server import HTTPServer
 
-from sandcrawler.ingest import IngestFileRequestHandler, FileIngester
+from sandcrawler.ingest import IngestFileRequestHandler, IngestFileWorker
 
 
 def run_single_ingest(args):
@@ -14,16 +14,17 @@ def run_single_ingest(args):
         ext_ids=dict(doi=args.doi),
         release_id=args.release_id,
     )
-    ingester = FileIngester()
-    result = ingester.ingest_file(request)
+    ingester = IngestFileWorker()
+    result = ingester.process(request)
     print(json.dumps(result))
     return result
 
 def run_requests(args):
-    ingester = FileIngester()
+    # TODO: switch to using JsonLinePusher
+    ingester = IngestFileWorker()
     for l in args.json_file:
         request = json.loads(l.strip())
-        result = ingester.ingest_file(request)
+        result = ingester.process(request)
         print(json.dumps(result))
 
 def run_api(args):
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 39503fc..e8fbcdf 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -3,4 +3,5 @@
 from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
 from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
 from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
 from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError
+from .ingest import IngestFileWorker
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e2dd44c..0bf1bbd 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,11 +9,15 @@ from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, Wayba
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.misc import gen_file_metadata
 from sandcrawler.html import extract_fulltext_url
+from sandcrawler.workers import SandcrawlerWorker
 
 
-class FileIngester:
-    def __init__(self, **kwargs):
+class IngestFileWorker(SandcrawlerWorker):
+
+    def __init__(self, sink=None, **kwargs):
+        super().__init__()
+
+        self.sink = sink
         self.spn_client = kwargs.get('spn_client', SavePageNowClient())
         self.wayback_client = kwargs.get('wayback_client',
@@ -75,7 +79,7 @@ class FileIngester:
             body = resp.content
         return (cdx, body)
 
-    def ingest_file(self, request):
+    def process(self, request):
         """
         1. check sandcrawler-db for base_url
             -> if found, populate terminal+wayback fields
@@ -170,8 +174,8 @@ class IngestFileRequestHandler(BaseHTTPRequestHandler):
         length = int(self.headers.get('content-length'))
         request = json.loads(self.rfile.read(length).decode('utf-8'))
         print("Got request: {}".format(request))
-        ingester = FileIngester()
-        result = ingester.ingest_file(request)
+        ingester = FileIngestWorker()
+        result = ingester.process(request)
         self.send_response(200)
         self.end_headers()
         self.wfile.write(json.dumps(result))
```
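The core of this commit is that the renamed `IngestFileWorker` now subclasses `SandcrawlerWorker` and exposes `process()` plus an optional `sink`, which is the contract the pusher classes in `sandcrawler/workers.py` drive. The base class itself is not part of this diff, so the following is only an illustrative stand-in for that contract, not the actual implementation:

```python
# Illustrative stand-in for the worker/sink contract this commit adopts.
# The real SandcrawlerWorker lives in sandcrawler/workers.py and is not shown
# in this diff; everything here other than the process() name is an assumption.
class SandcrawlerWorker:

    def __init__(self):
        self.sink = None

    def push_record(self, record):
        # Pushers hand each record to the worker; subclasses only implement
        # process(), and any result is forwarded to the configured sink.
        result = self.process(record)
        if self.sink:
            self.sink.push_record(result)
        return result


class IngestFileWorker(SandcrawlerWorker):

    def process(self, record):
        # The real method fetches and archives the file described by the
        # request; a placeholder result stands in for the ingest outcome here.
        return {"hit": False, "request": record}
```

With that contract in place, the TODO added to `run_requests()` becomes straightforward: instead of looping over `args.json_file` by hand, the CLI could hand the worker to `JsonLinePusher` (already exported from `sandcrawler/__init__.py` in this commit) and let the pusher drive the loop. A rough sketch, with the caveat that the `JsonLinePusher` constructor arguments and its `run()` method are assumed here rather than shown in the diff:

```python
# Hypothetical follow-up to the TODO in run_requests(): feed requests to the
# worker via JsonLinePusher instead of a hand-rolled loop. Assumes
# JsonLinePusher(worker, json_file) and pusher.run(), neither shown in this diff.
from sandcrawler import IngestFileWorker, JsonLinePusher

def run_requests(args):
    # A sink (e.g. KafkaSink) could be passed so results go to Kafka instead
    # of stdout; with no sink, the worker simply returns each result.
    worker = IngestFileWorker()
    pusher = JsonLinePusher(worker, args.json_file)
    pusher.run()
```

Moving the iteration into a pusher would keep `ingest_file.py` consistent with the rest of the worker/pusher tooling in the repo and keep per-record bookkeeping in one place rather than in each command.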