diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 16:30:20 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 16:30:20 -0700 | 
| commit | e5c7645010ed1315a43f9cc0cd20ca192b5e8008 (patch) | |
| tree | cf50c444d70010154c97db5714f29d0db4b7657a | |
| parent | b388be5aff1b074b82a5382c5267a8ab4c9e615b (diff) | |
| download | sandcrawler-e5c7645010ed1315a43f9cc0cd20ca192b5e8008.tar.gz sandcrawler-e5c7645010ed1315a43f9cc0cd20ca192b5e8008.zip  | |
SPN: more verbose status logging
| -rw-r--r-- | python/sandcrawler/ia.py | 4 | 
1 file changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 30ebc77..2d0d068 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -4,6 +4,7 @@
 # pylint: skip-file
 
 import os, sys, time
+import json
 import requests
 import datetime
 from collections import namedtuple
@@ -861,6 +862,7 @@ class SavePageNowClient:
         # if there was a recent crawl of same URL, fetch the status of that
         # crawl to get correct datetime
         if final_json.get('original_job_id'):
+            print(f"  SPN recent capture: {job_id} -> {final_json['original_job_id']}", file=sys.stderr)
             resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, final_json['original_job_id']))
             try:
                 resp.raise_for_status()
@@ -871,6 +873,8 @@ class SavePageNowClient:
         #print(final_json, file=sys.stderr)
 
         if final_json['status'] == "success":
+            if final_json.get('original_url').startswith('/'):
+                print(f"  truncateded URL in JSON: {request_url} {json.dumps(final_json)}", file=sys.stderr)
             return SavePageNowResult(
                 True,
                 "success",
