diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 6 | +++---
-rw-r--r-- | python/sandcrawler/ingest.py | 2 | +-
2 files changed, 4 insertions, 4 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index d91844b..4b4875d 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -151,14 +151,14 @@ class CdxApiClient:
         resp = self._query_api(params)
         if not resp:
             if retry_sleep:
-                print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep))
+                print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
                 time.sleep(retry_sleep)
                 return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
             raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
         row = resp[0]
         if not (row.url == url and row.datetime == datetime):
             if retry_sleep:
-                print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep))
+                print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
                 time.sleep(retry_sleep)
                 return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
             raise KeyError("Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(url, datetime, row))
@@ -731,7 +731,7 @@ class SavePageNowClient:
                     retry_sleep=10.0,
                 )
             except KeyError as ke:
-                print(str(ke), file=sys.stderr)
+                print("CDX KeyError: {}".format(ke), file=sys.stderr)
                 return ResourceResult(
                     start_url=start_url,
                     hit=False,
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 331df11..8c77d65 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -214,7 +214,7 @@ class IngestFileWorker(SandcrawlerWorker):
                     return result
                 next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
                 assert next_url
-                print("[EXTRACT\t] {}\t{}".format(
+                print("[PARSE\t] {}\t{}".format(
                         fulltext_url.get('technique'),
                         next_url,
                     ),