diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-29 15:28:32 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-29 15:28:32 -0700 |
commit | 24bfdfaa260156e395c509f0c18657e79dc6f730 (patch) | |
tree | 07767aa08bfda571daf7e3696adb0557c2cfd02c /python | |
parent | 8e9106885bc736648c0bf0151a29d4bea9b72650 (diff) | |
download | sandcrawler-24bfdfaa260156e395c509f0c18657e79dc6f730.tar.gz sandcrawler-24bfdfaa260156e395c509f0c18657e79dc6f730.zip |
html ingest: improve data flow
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html_ingest.py | 59 |
1 files changed, 41 insertions, 18 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 10662a1..f28231e 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -36,6 +36,25 @@ class WebResource(pydantic.BaseModel):
     sha256hex: Optional[str]
     resource_type: Optional[str]
 
+    class Config:
+        json_encoders = {
+            datetime.datetime: lambda dt: dt.isoformat()
+        }
+
+class IngestWebResult(pydantic.BaseModel):
+    status: str
+    request: Optional[Any] # TODO
+    html_resource: Optional[ResourceResult]
+    file_meta: Optional[dict]
+    html_fulltext: Optional[dict]
+    html_meta: Optional[BiblioMetadata]
+    subresources: Optional[List[WebResource]]
+
+    class Config:
+        json_encoders = {
+            datetime.datetime: lambda dt: dt.isoformat()
+        }
+
 
 def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
     """
@@ -82,13 +101,11 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
         wayback_resp = wayback_client.lookup_resource(resource['url'])
         if not wayback_resp:
             raise Exception("wayback lookup failed")
+        # XXX
         assert wayback_resp.status == 'success'
-        if wayback_resp.cdx.url != resource['url']:
-            pass
-            #raise Exception(
-            #    f"CDX lookup URL mismatch: {cdx_row.url} != {resource['url']}")
         file_meta = gen_file_metadata(wayback_resp.body)
-        assert file_meta['sha1hex'] == wayback_resp.cdx.sha1hex
+        if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
+            raise Exception("wayback payload sha1hex mismatch")
         full.append(WebResource(
             surt=wayback_resp.cdx.surt,
             timestamp=wayback_resp.cdx.datetime,
@@ -104,19 +121,26 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
     return full
 
 
-def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> None:
+def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
 
     adblock = load_adblock_rules()
     wayback_client = WaybackClient()
 
     html_resource = wayback_client.lookup_resource(url, "text/html")
     if html_resource.status != "success":
-        print(json.dumps(html_resource, indent=2))
-        return
+        return IngestWebResult(
+            status=html_resource.status,
+            html_resource=html_resource,
+        )
 
     file_meta = gen_file_metadata(html_resource.body)
-    # XXX:
-    assert file_meta['mimetype'] == "text/html"
+
+    if file_meta['mimetype'] != "text/html":
+        return IngestWebResult(
+            status="wrong-mimetype",
+            html_resource=html_resource,
+            file_meta=file_meta,
+        )
 
     html_doc = HTMLParser(html_resource.body)
     html_meta = html_extract_biblio(html_doc)
@@ -132,17 +156,15 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
     else:
         full_resources = fetch_html_resources(raw_resources, wayback_client, when)
 
-    output = dict(
+    output = IngestWebResult(
         status="success",
-        #html_resource=html_resource,
+        html_resource=html_resource,
         file_meta=file_meta,
         html_fulltext=html_fulltext,
-        # XXX:
-        html_meta=html_meta and html_meta.dict(exclude_none=True, exclude={'release_date'}),
-        resources=[r.dict(exclude_none=True, exclude={'timestamp'}) for r in full_resources],
+        html_meta=html_meta,
+        subresources=full_resources,
     )
-
-    print(json.dumps(output, indent=2))
+    return output
 
 
 def main() -> None:
@@ -183,7 +205,8 @@ def main() -> None:
         sys.exit(-1)
 
     if args.func == "run_single":
-        run_single(args.url, args.timestamp, args.quick_mode)
+        result = run_single(args.url, args.timestamp, args.quick_mode)
+        print(result.json(indent=2))
     else:
         #func = getattr(wp, args.func)
         #func()