aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r--python/sandcrawler/ingest.py26
1 files changed, 15 insertions, 11 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index f085fc5..4b6c587 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -6,7 +6,7 @@ import requests
from http.server import BaseHTTPRequestHandler, HTTPServer
from collections import namedtuple
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict
from sandcrawler.grobid import GrobidClient
from sandcrawler.misc import gen_file_metadata
from sandcrawler.html import extract_fulltext_url
@@ -54,7 +54,9 @@ class IngestFileWorker(SandcrawlerWorker):
if not self.grobid_client:
self.grobid_client = GrobidClient()
- self.attempt_existing = False
+ self.try_existing_ingest = False
+ self.try_wayback = True
+ self.try_spn2 = True
def check_existing_ingest(self, base_url):
"""
@@ -66,7 +68,7 @@ class IngestFileWorker(SandcrawlerWorker):
Looks at existing ingest results and makes a decision based on, eg,
status and timestamp.
"""
- if not self.attempt_existing:
+ if not self.try_existing_ingest:
return None
raise NotImplementedError
@@ -78,10 +80,16 @@ class IngestFileWorker(SandcrawlerWorker):
Looks in wayback for a resource starting at the URL, following any
redirects. If a hit isn't found, try crawling with SPN.
"""
- resource = self.wayback_client.lookup_resource(url, best_mimetype)
- if not resource or not resource.hit:
+ via = "none"
+ resource = None
+ if self.try_wayback:
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(url, best_mimetype)
+ if self.try_spn2 and (not resource or not resource.hit):
+ via = "spn2"
resource = self.spn_client.crawl_resource(url, self.wayback_client)
- print("[FETCH] {} url:{}".format(
+ print("[FETCH {}\t] {}\turl:{}".format(
+ via,
resource.status,
url),
file=sys.stderr)
@@ -146,11 +154,6 @@ class IngestFileWorker(SandcrawlerWorker):
try:
# first hop
resource = self.find_resource(base_url, best_mimetype)
- print("[{}] fetch status: {} url: {}".format(
- resource.status,
- ingest_type,
- base_url),
- file=sys.stderr)
if not resource.hit:
result['status'] = resource.status
return result
@@ -218,6 +221,7 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = "success"
result['hit'] = True
+ result['cdx'] = cdx_to_dict(resource.cdx)
return result