diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-02 20:07:52 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-02 20:07:54 -0800 |
commit | 9911f2cad6f7470dbdb5af835ad61bcd4b7ad318 (patch) | |
tree | 858d159780af94e76948a94da0836e76c962c272 /python | |
parent | b45e1ac6638edb9d634269a343d05eff90daa31e (diff) | |
download | sandcrawler-9911f2cad6f7470dbdb5af835ad61bcd4b7ad318.tar.gz sandcrawler-9911f2cad6f7470dbdb5af835ad61bcd4b7ad318.zip |
ingest: crude content-encoding support
This should perhaps be handled directly in the IA wrapper tool, instead of
in the ingest code. Or it may really be a bug in the wayback Python library or
SPN?
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest.py | 20 |
1 files changed, 19 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 9a4335b..529e663 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -1,12 +1,13 @@
 
 import sys
 import json
+import gzip
 import base64
 import requests
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from collections import namedtuple
 
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.misc import gen_file_metadata
 from sandcrawler.html import extract_fulltext_url
@@ -292,6 +293,23 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
 
         file_meta = gen_file_metadata(resource.body)
+        if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
+            print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+            inner_body = gzip.decompress(resource.body)
+            resource = ResourceResult(
+                body=inner_body,
+                # copy all other fields
+                start_url=resource.start_url,
+                hit=resource.hit,
+                status=resource.status,
+                terminal_url=resource.terminal_url,
+                terminal_dt=resource.terminal_dt,
+                terminal_status_code=resource.terminal_status_code,
+                cdx=resource.cdx,
+                revisit_cdx=resource.revisit_cdx,
+            )
+            file_meta = gen_file_metadata(resource.body)
+
         if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype']:
             # Got landing page or similar. Some XHTML detected as "application/xml"
             if resource.terminal_dt:
```