aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest_html.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest_html.py')
-rw-r--r--python/sandcrawler/ingest_html.py18
1 files changed, 15 insertions, 3 deletions
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 0ff7fe0..fb42e71 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -196,10 +196,20 @@ def fetch_html_resources(
wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest)
if not wayback_resp or wayback_resp.status != "success":
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
- file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
- if file_meta["sha1hex"] != wayback_resp.cdx.sha1hex:
+ # for HTML sub-resources specifically, we allow the CDX SHA1 to match
+ # either the transfer-encoded or inner (un-encoded) payload body to
+ # match. This is because of an ambiguity in the WARC specification
+ outer_file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
+ try:
+ file_meta, wayback_resp = fix_transfer_encoding(outer_file_meta, wayback_resp)
+ except Exception as e:
+ raise WaybackContentError(f"bad gzip encoding: {e}")
+ if (
+ file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ and outer_file_meta["sha1hex"] != wayback_resp.cdx.sha1hex
+ ):
raise WaybackContentError(
- f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}"
+ f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url} found:{file_meta['sha1hex']} expected:{wayback_resp.cdx.sha1hex}"
)
full.append(
WebResource(
@@ -250,6 +260,8 @@ def html_guess_platform(
in doc.html
):
return "ojs"
+ if '<a href="https://www.pubpub.org">Published with' in doc.html:
+ return "pubpub"
if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
return "arpha"
if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html: