diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:28:37 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:28:37 -0800 |
commit | 6431b2f6b4bd9bd4dea4b373b89eb3f89648cc4c (patch) | |
tree | a97bdff5e392854fbb779ee179d851319426166f /python/sandcrawler/html_ingest.py | |
parent | 5d525e9744303bf5ddcf673623483d4a6a787326 (diff) | |
download | sandcrawler-6431b2f6b4bd9bd4dea4b373b89eb3f89648cc4c.tar.gz sandcrawler-6431b2f6b4bd9bd4dea4b373b89eb3f89648cc4c.zip |
html: small ingest improvements
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r-- | python/sandcrawler/html_ingest.py | 4 |
1 file changed, 4 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 42bd946..a8ba0d6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -127,6 +127,10 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
             raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
         if cdx_row.url != resource['url']:
             print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr)
+        if not cdx_row.status_code:
+            # TODO: fall back to a full fetch?
+            print(f" WARN: skipping revisit record", file=sys.stderr)
+            continue
         full.append(WebResource(
             surt=cdx_row.surt,
             timestamp=cdx_row.datetime,
```