Diffstat (limited to 'python')
 python/ingest_file.py          | 11
 python/sandcrawler/__init__.py |  1
 python/sandcrawler/ingest.py   | 14
 3 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 4fd44ca..5a20aac 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -5,7 +5,7 @@ import json
import argparse
from http.server import HTTPServer
-from sandcrawler.ingest import IngestFileRequestHandler, FileIngester
+from sandcrawler.ingest import IngestFileRequestHandler, IngestFileWorker
def run_single_ingest(args):
@@ -14,16 +14,17 @@ def run_single_ingest(args):
ext_ids=dict(doi=args.doi),
release_id=args.release_id,
)
- ingester = FileIngester()
- result = ingester.ingest_file(request)
+ ingester = IngestFileWorker()
+ result = ingester.process(request)
print(json.dumps(result))
return result
def run_requests(args):
- ingester = FileIngester()
+ # TODO: switch to using JsonLinePusher
+ ingester = IngestFileWorker()
for l in args.json_file:
request = json.loads(l.strip())
- result = ingester.ingest_file(request)
+ result = ingester.process(request)
print(json.dumps(result))
def run_api(args):
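
The TODO above flags a follow-up: run_requests() still loops over the JSON file by hand instead of going through JsonLinePusher, which the package already exports from sandcrawler.workers. A minimal sketch of that switch, assuming JsonLinePusher takes a worker plus a line-oriented JSON file and drives worker.process() from its run() loop (not part of this commit):

    # Hypothetical rewrite of run_requests() around JsonLinePusher; the
    # constructor and run() semantics are assumed from the other pushers
    # exported in sandcrawler/__init__.py, not confirmed by this diff.
    from sandcrawler import IngestFileWorker, JsonLinePusher

    def run_requests(args):
        worker = IngestFileWorker()
        pusher = JsonLinePusher(worker, args.json_file)
        pusher.run()
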
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 39503fc..e8fbcdf 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -3,4 +3,5 @@ from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError
+from .ingest import IngestFileWorker
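
With IngestFileWorker re-exported at the package root, callers no longer need to reach into sandcrawler.ingest directly. A quick smoke test of the new import path, assuming the rename in ingest.py (next hunk) lands with it:

    # Exercises only the package-level export added in this hunk;
    # no worker is instantiated, so no wayback/SPN clients are constructed.
    from sandcrawler import IngestFileWorker

    print(IngestFileWorker.__mro__)  # should include SandcrawlerWorker
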
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e2dd44c..0bf1bbd 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,11 +9,15 @@ from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, Wayba
from sandcrawler.grobid import GrobidClient
from sandcrawler.misc import gen_file_metadata
from sandcrawler.html import extract_fulltext_url
+from sandcrawler.workers import SandcrawlerWorker
-class FileIngester:
- def __init__(self, **kwargs):
+class IngestFileWorker(SandcrawlerWorker):
+
+ def __init__(self, sink=None, **kwargs):
+ super().__init__()
+ self.sink = sink
self.spn_client = kwargs.get('spn_client',
SavePageNowClient())
self.wayback_client = kwargs.get('wayback_client',
@@ -75,7 +79,7 @@ class FileIngester:
body = resp.content
return (cdx, body)
- def ingest_file(self, request):
+ def process(self, request):
"""
1. check sandcrawler-db for base_url
-> if found, populate terminal+wayback fields
@@ -170,8 +174,8 @@ class IngestFileRequestHandler(BaseHTTPRequestHandler):
length = int(self.headers.get('content-length'))
request = json.loads(self.rfile.read(length).decode('utf-8'))
print("Got request: {}".format(request))
- ingester = FileIngester()
- result = ingester.ingest_file(request)
+ ingester = IngestFileWorker()
+ result = ingester.process(request)
self.send_response(200)
self.end_headers()
self.wfile.write(json.dumps(result))
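
Taken together, the rename brings file ingest in line with the package's SandcrawlerWorker interface: construct the worker (optionally with a sink) and hand it request dicts via process(). A sketch of a direct call under that interface; the request values are hypothetical examples shaped like the dict run_single_ingest() builds above, not values from this commit:

    import json
    from sandcrawler import IngestFileWorker

    # Hypothetical single request; base_url is referenced by the process()
    # docstring above, ext_ids/release_id mirror run_single_ingest().
    request = {
        "base_url": "https://example.com/paper.pdf",
        "ext_ids": {"doi": "10.1234/example"},
        "release_id": None,
    }

    worker = IngestFileWorker()    # sink is optional per the new __init__
    result = worker.process(request)
    print(json.dumps(result))
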