about summary refs log tree commit diff stats
path: root/python/sandcrawler/ingest_file.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- python/sandcrawler/ingest_file.py 33
1 files changed, 21 insertions, 12 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 281c6d3..9faf98b 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -53,23 +53,32 @@ class IngestFileWorker(SandcrawlerWorker):
process_file_hit(ResourceResult) -> response
process_grobid(ResourceResult)
"""
- def __init__(self, sink=None, **kwargs):
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
super().__init__()
self.sink = sink
- self.wayback_client = kwargs.get('wayback_client')
- if not self.wayback_client:
+
+ if kwargs.get('wayback_client'):
+ self.wayback_client: WaybackClient = kwargs['wayback_client']
+ else:
self.wayback_client = WaybackClient()
- self.spn_client = kwargs.get('spn_client')
- if not self.spn_client:
+
+ if kwargs.get('spn_client'):
+ self.spn_client: SavePageNowClient = kwargs['spn_client']
+ else:
self.spn_client = SavePageNowClient(
spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
- self.grobid_client = kwargs.get('grobid_client')
- if not self.grobid_client:
+
+ if kwargs.get('grobid_client'):
+ self.grobid_client: GrobidClient = kwargs['grobid_client']
+ else:
self.grobid_client = GrobidClient()
- self.pgrest_client = kwargs.get('pgrest_client')
- if not self.pgrest_client:
+
+ if kwargs.get('pgrest_client'):
+ self.pgrest_client: SandcrawlerPostgrestClient = kwargs['pgrest_client']
+ else:
self.pgrest_client = SandcrawlerPostgrestClient()
+
self.grobid_sink = kwargs.get('grobid_sink')
self.thumbnail_sink = kwargs.get('thumbnail_sink')
self.pdftext_sink = kwargs.get('pdftext_sink')
@@ -213,9 +222,9 @@ class IngestFileWorker(SandcrawlerWorker):
return None
def find_resource(self,
- url,
- best_mimetype=None,
- force_recrawl=False) -> Optional[ResourceResult]:
+ url: str,
+ best_mimetype: Optional[str] = None,
+ force_recrawl: bool = False) -> Optional[ResourceResult]:
"""
Looks in wayback for a resource starting at the URL, following any
redirects. If a hit isn't found, try crawling with SPN.