about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-15 13:52:42 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-15 13:52:42 -0800
commit 4d0224f3e73315ef4db39643e6d4851e4a466658 (patch)
tree 43f22e098e565ae67a515d1a17bca5d6f632ae76
parent 2d052b610ed02341aebab865f174671f8381146e (diff)
download sandcrawler-4d0224f3e73315ef4db39643e6d4851e4a466658.tar.gz
sandcrawler-4d0224f3e73315ef4db39643e6d4851e4a466658.zip
pass through revisit_cdx
-rw-r--r-- python/sandcrawler/ia.py 23
-rw-r--r-- python/sandcrawler/ingest.py 3
2 files changed, 21 insertions, 5 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 08d1152..29991df 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -25,6 +25,7 @@ ResourceResult = namedtuple("ResourceResult", [
"terminal_status_code",
"body",
"cdx",
+ "revisit_cdx",
])
WarcResource = namedtuple("WarcResource", [
@@ -505,14 +506,18 @@ class WaybackClient:
terminal_status_code=None,
body=None,
cdx=None,
+ revisit_cdx=None,
)
if cdx_row.status_code in (200, 226):
+ revisit_cdx = None
if '/' in cdx_row.warc_path:
- body = self.fetch_petabox_body(
+ resource = self.fetch_petabox(
csize=cdx_row.warc_csize,
offset=cdx_row.warc_offset,
warc_path=cdx_row.warc_path,
)
+ body = resource.body
+ revisit_cdx = resource.revisit_cdx
else:
body = self.fetch_replay_body(
url=cdx_row.url,
@@ -528,6 +533,7 @@ class WaybackClient:
terminal_status_code=cdx_row.status_code,
body=body,
cdx=cdx_row,
+ revisit_cdx=revisit_cdx,
)
elif 300 <= (cdx_row.status_code or 0) < 400:
if '/' in cdx_row.warc_path:
@@ -558,6 +564,7 @@ class WaybackClient:
terminal_status_code=cdx_row.status_code,
body=None,
cdx=cdx_row,
+ revisit_cdx=None,
)
if next_url in urls_seen:
return ResourceResult(
@@ -569,6 +576,7 @@ class WaybackClient:
terminal_status_code=cdx_row.status_code,
body=None,
cdx=cdx_row,
+ revisit_cdx=None,
)
urls_seen.append(next_url)
continue
@@ -582,6 +590,7 @@ class WaybackClient:
terminal_status_code=cdx_row.status_code,
body=None,
cdx=cdx_row,
+ revisit_cdx=None,
)
return ResourceResult(
start_url=start_url,
@@ -592,6 +601,7 @@ class WaybackClient:
terminal_status_code=cdx_row.status_code,
body=None,
cdx=cdx_row,
+ revisit_cdx=None,
)
@@ -758,6 +768,7 @@ class SavePageNowClient:
terminal_status_code=None,
body=None,
cdx=None,
+ revisit_cdx=None,
)
#print(spn_result, file=sys.stderr)
@@ -798,11 +809,12 @@ class SavePageNowClient:
terminal_status_code=None,
body=None,
cdx=None,
+ revisit_cdx=None,
)
#print(cdx_row, file=sys.stderr)
- cdx_ret = cdx_row
+ revisit_cdx = None
if '/' in cdx_row.warc_path:
# Usually can't do this kind of direct fetch because CDX result is recent/live
resource = wayback_client.fetch_petabox(
@@ -813,7 +825,7 @@ class SavePageNowClient:
body = resource.body
if resource.revisit_cdx:
assert resource.revisit_cdx.sha1hex == cdx_row.sha1hex
- cdx_ret = resource.revisit_cdx
+ revisit_cdx = resource.revisit_cdx
else:
# note: currently not trying to verify cdx_row.sha1hex
body = wayback_client.fetch_replay_body(
@@ -821,7 +833,7 @@ class SavePageNowClient:
datetime=cdx_row.datetime,
)
# warc_path etc will change, so strip them out
- cdx_ret = cdx_partial_from_row(cdx_row)
+ cdx_row = cdx_partial_from_row(cdx_row)
return ResourceResult(
start_url=start_url,
@@ -831,6 +843,7 @@ class SavePageNowClient:
terminal_dt=cdx_row.datetime,
terminal_status_code=cdx_row.status_code,
body=body,
- cdx=cdx_ret,
+ cdx=cdx_row,
+ revisit_cdx=revisit_cdx,
)
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e5eb6e8..de5e957 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -239,6 +239,7 @@ class IngestFileWorker(SandcrawlerWorker):
if not resource.hit:
result['status'] = resource.status
return result
+
file_meta = gen_file_metadata(resource.body)
if "html" in file_meta['mimetype'] or "xml" in file_meta['mimetype']:
@@ -289,6 +290,8 @@ class IngestFileWorker(SandcrawlerWorker):
result['file_meta'] = file_meta
result['cdx'] = cdx_to_dict(resource.cdx)
+ if resource.revisit_cdx:
+ result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
# other failure cases
if not resource.body or file_meta['size_bytes'] == 0: