about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/ia.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-03 16:14:33 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-03 16:14:33 -0800
commit: a1a4e96e44bfb851003e578defd6f33008be6871 (patch)
tree: 7696c8450a591faecc704827ec933653c737e26b /python/sandcrawler/ia.py
parent: 4c46d822463573b824e9dbb2a1acca99f58b6853 (diff)
download: sandcrawler-a1a4e96e44bfb851003e578defd6f33008be6871.tar.gz
download: sandcrawler-a1a4e96e44bfb851003e578defd6f33008be6871.zip
ingest: tweak debug printing alignment
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- python/sandcrawler/ia.py | 15
1 files changed, 7 insertions, 8 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index b1f90ea..cca81fa 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -363,7 +363,7 @@ class WaybackClient:
#print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
except wayback.exception.ResourceUnavailable:
- print("Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
+ print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
except ValueError as ve:
raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
@@ -511,7 +511,7 @@ class WaybackClient:
# TODO: don't need *all* these hashes, just sha1
file_meta = gen_file_metadata(resp.content)
if cdx_sha1hex != file_meta['sha1hex']:
- print("REPLAY MISMATCH: cdx:{} replay:{}".format(
+ print(" REPLAY MISMATCH: cdx:{} replay:{}".format(
cdx_sha1hex,
file_meta['sha1hex']),
file=sys.stderr)
@@ -672,7 +672,7 @@ class WaybackClient:
)
assert 300 <= resource.status_code < 400
if not resource.location:
- print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
@@ -701,7 +701,7 @@ class WaybackClient:
next_url = clean_url(next_url)
cdx_row = cdx_partial_from_row(cdx_row)
if not next_url:
- print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
@@ -982,13 +982,12 @@ class SavePageNowClient:
elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
spn_result.request_url,
best_mimetype="application/pdf",
- closest=closest,
)
if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf":
- print("Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ print(" Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
cdx_row = elsevier_pdf_cdx
else:
- print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
#print(elsevier_pdf_cdx, file=sys.stderr)
if not cdx_row:
@@ -1004,7 +1003,7 @@ class SavePageNowClient:
retry_sleep=9.0,
)
except KeyError as ke:
- print("CDX KeyError: {}".format(ke), file=sys.stderr)
+ print(" CDX KeyError: {}".format(ke), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,