diff options
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- | python/sandcrawler/ingest.py | 16 |
1 files changed, 15 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 5dc5b55..c9a697c 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -93,6 +93,15 @@ class IngestFileWorker(SandcrawlerWorker): "digital.ucd.ie/", # ireland national historical ] + # these are special-case web domains for which we want SPN2 to not run + # a headless browser (brozzler), but instead simply run wget. + # the motivation could be to work around browser issues, or in the + # future possibly to increase download efficiency (wget/fetch being + # faster than browser fetch) + self.spn2_simple_get_domains = [ + ] + + def check_existing_ingest(self, base_url): """ Check in sandcrawler-db (postgres) to see if we have already ingested @@ -138,7 +147,12 @@ class IngestFileWorker(SandcrawlerWorker): if self.try_spn2 and (not resource or not resource.hit or soft404): via = "spn2" - resource = self.spn_client.crawl_resource(url, self.wayback_client) + force_get = 0 + for domain in self.spn2_simple_get_domains: + if domain in url: + force_get = 1 + break + resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get) print("[FETCH {}\t] {}\t{}".format( via, resource.status, |