about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 6
-rw-r--r-- python/sandcrawler/ingest.py 2
2 files changed, 4 insertions, 4 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index d91844b..4b4875d 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -151,14 +151,14 @@ class CdxApiClient:
resp = self._query_api(params)
if not resp:
if retry_sleep:
- print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep))
+ print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
time.sleep(retry_sleep)
return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime))
row = resp[0]
if not (row.url == url and row.datetime == datetime):
if retry_sleep:
- print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep))
+ print("CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), file=sys.stderr)
time.sleep(retry_sleep)
return self.fetch(url, datetime, filter_status_code=filter_status_code, retry_sleep=None)
raise KeyError("Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format(url, datetime, row))
@@ -731,7 +731,7 @@ class SavePageNowClient:
retry_sleep=10.0,
)
except KeyError as ke:
- print(str(ke), file=sys.stderr)
+ print("CDX KeyError: {}".format(ke), file=sys.stderr)
return ResourceResult(
start_url=start_url,
hit=False,
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 331df11..8c77d65 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -214,7 +214,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
assert next_url
- print("[EXTRACT\t] {}\t{}".format(
+ print("[PARSE\t] {}\t{}".format(
fulltext_url.get('technique'),
next_url,
),