ingest: crude content-encoding support

This should perhaps be handled directly in the IA wrapper tool rather than in the ingest code. Or, really, it may be a bug in the wayback Python library or in SPN?
author: Bryan Newbold <bnewbold@archive.org> 2020-03-02 20:07:52 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-02 20:07:54 -0800
commit: 9911f2cad6f7470dbdb5af835ad61bcd4b7ad318 (patch)
tree: 858d159780af94e76948a94da0836e76c962c272 /python
parent: b45e1ac6638edb9d634269a343d05eff90daa31e (diff)
download: sandcrawler-9911f2cad6f7470dbdb5af835ad61bcd4b7ad318.tar.gz
sandcrawler-9911f2cad6f7470dbdb5af835ad61bcd4b7ad318.zip
1 files changed, 19 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 9a4335b..529e663 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -1,12 +1,13 @@
 
 import sys
 import json
+import gzip
 import base64
 import requests
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from collections import namedtuple
 
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.misc import gen_file_metadata
 from sandcrawler.html import extract_fulltext_url
@@ -292,6 +293,23 @@ class IngestFileWorker(SandcrawlerWorker):
                 return result
             file_meta = gen_file_metadata(resource.body)
 
+            if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
+                print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+                inner_body = gzip.decompress(resource.body)
+                resource = ResourceResult(
+                    body=inner_body,
+                    # copy all other fields
+                    start_url=resource.start_url,
+                    hit=resource.hit,
+                    status=resource.status,
+                    terminal_url=resource.terminal_url,
+                    terminal_dt=resource.terminal_dt,
+                    terminal_status_code=resource.terminal_status_code,
+                    cdx=resource.cdx,
+                    revisit_cdx=resource.revisit_cdx,
+                )
+                file_meta = gen_file_metadata(resource.body)
+
             if "html" in file_meta['mimetype'] or "xhtml" in file_meta['mimetype'] or "application/xml" in file_meta['mimetype']:
                 # Got landing page or similar. Some XHTML detected as "application/xml"
                 if resource.terminal_dt:
author	Bryan Newbold <bnewbold@archive.org>	2020-03-02 20:07:52 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-03-02 20:07:54 -0800
commit	9911f2cad6f7470dbdb5af835ad61bcd4b7ad318 (patch)
tree	858d159780af94e76948a94da0836e76c962c272 /python
parent	b45e1ac6638edb9d634269a343d05eff90daa31e (diff)
download	sandcrawler-9911f2cad6f7470dbdb5af835ad61bcd4b7ad318.tar.gz sandcrawler-9911f2cad6f7470dbdb5af835ad61bcd4b7ad318.zip