diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-09-30 15:09:42 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:04 -0700 |
commit | 86107e39b761e5b799562af662219fda04ade1be (patch) | |
tree | 9a9801fc1d8a583a792457c2b8468ce6dbea22f1 /python/sandcrawler | |
parent | 7430ddbcdec76091220de474060b968f0ef1bb70 (diff) | |
download | sandcrawler-86107e39b761e5b799562af662219fda04ade1be.tar.gz sandcrawler-86107e39b761e5b799562af662219fda04ade1be.zip |
refactoring; progress on filesets
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- | python/sandcrawler/__init__.py | 3 | ||||
-rw-r--r-- | python/sandcrawler/ingest_file.py | 5 |
2 files changed, 7 insertions, 1 deletion
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index e461462..724a39c 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -4,7 +4,8 @@ from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
 from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
 from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
 from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
-from .ingest import IngestFileWorker
+from .ingest_file import IngestFileWorker
+from .ingest_fileset import IngestFilesetWorker
 from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
 from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
 from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index b852c69..a02e923 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -25,6 +25,8 @@ from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.xml import xml_reserialize
 
+from sandcrawler.platforms.generic import DirectFileHelper
+
 
 MAX_BODY_SIZE_BYTES = 128*1024*1024
 
@@ -520,6 +522,9 @@ class IngestFileWorker(SandcrawlerWorker):
         return True
 
     def process(self, request: dict, key: Any = None) -> dict:
+        return self.process_file(request, key=key)
+
+    def process_file(self, request: dict, key: Any = None) -> dict:
 
         # old backwards compatibility
         if request.get('ingest_type') == 'file':