about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-03 16:16:17 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-03 16:16:17 -0800
commit 0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf (patch)
tree 29f63ad1109adcf0725f5d512a4ee28b7f7ac520 /python
parent 5d45a76e6c2c2ba530484c578db5e726c685eba8 (diff)
download sandcrawler-0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf.tar.gz
sandcrawler-0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf.zip
move transfer encoding helper to sandcrawler/ia.py
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html_ingest.py 38
-rw-r--r-- python/sandcrawler/ia.py 27
2 files changed, 42 insertions, 23 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 284461e..823218b 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -1,7 +1,6 @@
import io
import sys
-import gzip
import json
import datetime
import argparse
@@ -12,7 +11,7 @@ import trafilatura
import pydantic
from selectolax.parser import HTMLParser
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict
+from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding
from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx
from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
@@ -74,27 +73,22 @@ class IngestWebResult(pydantic.BaseModel):
datetime.datetime: lambda dt: dt.isoformat(),
}
-
-def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
- if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
- print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
- inner_body = gzip.decompress(resource.body)
- inner_resource = ResourceResult(
- body=inner_body,
- # copy all other fields
- start_url=resource.start_url,
- hit=resource.hit,
- status=resource.status,
- terminal_url=resource.terminal_url,
- terminal_dt=resource.terminal_dt,
- terminal_status_code=resource.terminal_status_code,
- cdx=resource.cdx,
- revisit_cdx=resource.revisit_cdx,
+ def to_sql_tuple(self) -> Tuple:
+ """
+ This is for the html_meta SQL table.
+ """
+ assert self.file_meta and "sha1hex" in self.file_meta
+ return (
+ self.file_meta["sha1hex"],
+ datetime.datetime.now(), # updated
+ self.status,
+ self.scope,
+ bool(self.html_body and self.html_body['status'] == 'success' and self.html_body['tei_xml']),
+ False, # has_thumbnail
+ (self.html_body and self.html_body.get('word_count')) or None,
+ self.html_biblio,
+ self.html_resources,
)
- inner_file_meta = gen_file_metadata(inner_resource.body)
- return (inner_file_meta, inner_resource)
- else:
- return (file_meta, resource)
def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index cca81fa..a3d8249 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,10 +3,14 @@
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os, sys, time
+import os
+import sys
+import time
+import gzip
import json
import requests
import datetime
+from typing import Tuple
from collections import namedtuple
import http.client
@@ -1064,3 +1068,24 @@ class SavePageNowClient:
revisit_cdx=revisit_cdx,
)
+
+def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
+ if resource.body and file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
+ print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+ inner_body = gzip.decompress(resource.body)
+ inner_resource = ResourceResult(
+ body=inner_body,
+ # copy all other fields
+ start_url=resource.start_url,
+ hit=resource.hit,
+ status=resource.status,
+ terminal_url=resource.terminal_url,
+ terminal_dt=resource.terminal_dt,
+ terminal_status_code=resource.terminal_status_code,
+ cdx=resource.cdx,
+ revisit_cdx=resource.revisit_cdx,
+ )
+ inner_file_meta = gen_file_metadata(inner_resource.body)
+ return (inner_file_meta, inner_resource)
+ else:
+ return (file_meta, resource)