about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/ia.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-03 16:16:17 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-03 16:16:17 -0800
commit: 0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf (patch)
tree: 29f63ad1109adcf0725f5d512a4ee28b7f7ac520 /python/sandcrawler/ia.py
parent: 5d45a76e6c2c2ba530484c578db5e726c685eba8 (diff)
download: sandcrawler-0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf.tar.gz
download: sandcrawler-0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf.zip
move transfer encoding helper to sandcrawler/ia.py
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- python/sandcrawler/ia.py 27
1 files changed, 26 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index cca81fa..a3d8249 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,10 +3,14 @@
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os, sys, time
+import os
+import sys
+import time
+import gzip
import json
import requests
import datetime
+from typing import Tuple
from collections import namedtuple
import http.client
@@ -1064,3 +1068,24 @@ class SavePageNowClient:
revisit_cdx=revisit_cdx,
)
+
+def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
+ if resource.body and file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
+ print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+ inner_body = gzip.decompress(resource.body)
+ inner_resource = ResourceResult(
+ body=inner_body,
+ # copy all other fields
+ start_url=resource.start_url,
+ hit=resource.hit,
+ status=resource.status,
+ terminal_url=resource.terminal_url,
+ terminal_dt=resource.terminal_dt,
+ terminal_status_code=resource.terminal_status_code,
+ cdx=resource.cdx,
+ revisit_cdx=resource.revisit_cdx,
+ )
+ inner_file_meta = gen_file_metadata(inner_resource.body)
+ return (inner_file_meta, inner_resource)
+ else:
+ return (file_meta, resource)