about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/html_ingest.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:28:37 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:28:37 -0800
commit: 6431b2f6b4bd9bd4dea4b373b89eb3f89648cc4c (patch)
tree: a97bdff5e392854fbb779ee179d851319426166f /python/sandcrawler/html_ingest.py
parent: 5d525e9744303bf5ddcf673623483d4a6a787326 (diff)
download: sandcrawler-6431b2f6b4bd9bd4dea4b373b89eb3f89648cc4c.tar.gz
download: sandcrawler-6431b2f6b4bd9bd4dea4b373b89eb3f89648cc4c.zip
html: small ingest improvements
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r--  python/sandcrawler/html_ingest.py  4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 42bd946..a8ba0d6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -127,6 +127,10 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
if cdx_row.url != resource['url']:
print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr)
+ if not cdx_row.status_code:
+ # TODO: fall back to a full fetch?
+ print(f" WARN: skipping revisit record", file=sys.stderr)
+ continue
full.append(WebResource(
surt=cdx_row.surt,
timestamp=cdx_row.datetime,