diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 16:16:17 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 16:16:17 -0800 |
commit | 0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf (patch) | |
tree | 29f63ad1109adcf0725f5d512a4ee28b7f7ac520 /python/sandcrawler/ia.py | |
parent | 5d45a76e6c2c2ba530484c578db5e726c685eba8 (diff) | |
download | sandcrawler-0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf.tar.gz sandcrawler-0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf.zip |
move transfer encoding helper to sandcrawler/ia.py
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- | python/sandcrawler/ia.py | 27 |
1 files changed, 26 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index cca81fa..a3d8249 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,10 +3,14 @@
 # in `wayback` library. Means we can't run pylint.
 # pylint: skip-file
 
-import os, sys, time
+import os
+import sys
+import time
+import gzip
 import json
 import requests
 import datetime
+from typing import Tuple
 from collections import namedtuple
 
 import http.client
@@ -1064,3 +1068,24 @@ class SavePageNowClient:
             revisit_cdx=revisit_cdx,
         )
 
+
+def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
+    if resource.body and file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
+        print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+        inner_body = gzip.decompress(resource.body)
+        inner_resource = ResourceResult(
+            body=inner_body,
+            # copy all other fields
+            start_url=resource.start_url,
+            hit=resource.hit,
+            status=resource.status,
+            terminal_url=resource.terminal_url,
+            terminal_dt=resource.terminal_dt,
+            terminal_status_code=resource.terminal_status_code,
+            cdx=resource.cdx,
+            revisit_cdx=resource.revisit_cdx,
+        )
+        inner_file_meta = gen_file_metadata(inner_resource.body)
+        return (inner_file_meta, inner_resource)
+    else:
+        return (file_meta, resource)