diff options
-rw-r--r-- | python/sandcrawler/ia.py     | 15
-rw-r--r-- | python/sandcrawler/ingest.py |  6
2 files changed, 10 insertions, 11 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index b1f90ea..cca81fa 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -363,7 +363,7 @@ class WaybackClient: #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr) gwb_record = self.rstore.load_resource(warc_uri, offset, csize) except wayback.exception.ResourceUnavailable: - print("Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) + print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)") except ValueError as ve: raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) @@ -511,7 +511,7 @@ class WaybackClient: # TODO: don't need *all* these hashes, just sha1 file_meta = gen_file_metadata(resp.content) if cdx_sha1hex != file_meta['sha1hex']: - print("REPLAY MISMATCH: cdx:{} replay:{}".format( + print(" REPLAY MISMATCH: cdx:{} replay:{}".format( cdx_sha1hex, file_meta['sha1hex']), file=sys.stderr) @@ -672,7 +672,7 @@ class WaybackClient: ) assert 300 <= resource.status_code < 400 if not resource.location: - print("bad redirect record: {}".format(cdx_row), file=sys.stderr) + print(" bad redirect record: {}".format(cdx_row), file=sys.stderr) return ResourceResult( start_url=start_url, hit=False, @@ -701,7 +701,7 @@ class WaybackClient: next_url = clean_url(next_url) cdx_row = cdx_partial_from_row(cdx_row) if not next_url: - print("bad redirect record: {}".format(cdx_row), file=sys.stderr) + print(" bad redirect record: {}".format(cdx_row), file=sys.stderr) return ResourceResult( start_url=start_url, hit=False, @@ -982,13 +982,12 @@ class SavePageNowClient: elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best( spn_result.request_url, best_mimetype="application/pdf", - closest=closest, ) if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf": - print("Trying pdf.sciencedirectassets.com hack!", file=sys.stderr) + print(" Trying pdf.sciencedirectassets.com hack!", file=sys.stderr) cdx_row = elsevier_pdf_cdx else: - print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr) + print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr) #print(elsevier_pdf_cdx, file=sys.stderr) if not cdx_row: @@ -1004,7 +1003,7 @@ class SavePageNowClient: retry_sleep=9.0, ) except KeyError as ke: - print("CDX KeyError: {}".format(ke), file=sys.stderr) + print(" CDX KeyError: {}".format(ke), file=sys.stderr) return ResourceResult( start_url=start_url, hit=False,
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 35b37fc..a39d9ea 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -193,7 +193,7 @@ class IngestFileWorker(SandcrawlerWorker): force_simple_get = 1 break resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get) - print("[FETCH {}\t] {}\t{}".format( + print("[FETCH {:>6}] {} {}".format( via, resource.status, resource.terminal_url or url), @@ -331,10 +331,10 @@ class IngestFileWorker(SandcrawlerWorker): for block in self.base_url_blocklist: if block in base_url: - print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr) + print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr) return dict(request=request, hit=False, status="skip-url-blocklist") - print("[INGEST {}\t] {}".format(ingest_type, base_url), file=sys.stderr) + print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr) best_mimetype = None if ingest_type == "pdf":