about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/html_ingest.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-29 15:28:32 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-29 15:28:32 -0700
commit 24bfdfaa260156e395c509f0c18657e79dc6f730 (patch)
tree 07767aa08bfda571daf7e3696adb0557c2cfd02c /python/sandcrawler/html_ingest.py
parent 8e9106885bc736648c0bf0151a29d4bea9b72650 (diff)
download sandcrawler-24bfdfaa260156e395c509f0c18657e79dc6f730.tar.gz
sandcrawler-24bfdfaa260156e395c509f0c18657e79dc6f730.zip
html ingest: improve data flow
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r-- python/sandcrawler/html_ingest.py 59
1 files changed, 41 insertions, 18 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 10662a1..f28231e 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -36,6 +36,25 @@ class WebResource(pydantic.BaseModel):
sha256hex: Optional[str]
resource_type: Optional[str]
+ class Config:
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat()
+ }
+
+class IngestWebResult(pydantic.BaseModel):
+ status: str
+ request: Optional[Any] # TODO
+ html_resource: Optional[ResourceResult]
+ file_meta: Optional[dict]
+ html_fulltext: Optional[dict]
+ html_meta: Optional[BiblioMetadata]
+ subresources: Optional[List[WebResource]]
+
+ class Config:
+ json_encoders = {
+ datetime.datetime: lambda dt: dt.isoformat()
+ }
+
def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
"""
@@ -82,13 +101,11 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
wayback_resp = wayback_client.lookup_resource(resource['url'])
if not wayback_resp:
raise Exception("wayback lookup failed")
+ # XXX
assert wayback_resp.status == 'success'
- if wayback_resp.cdx.url != resource['url']:
- pass
- #raise Exception(
- # f"CDX lookup URL mismatch: {cdx_row.url} != {resource['url']}")
file_meta = gen_file_metadata(wayback_resp.body)
- assert file_meta['sha1hex'] == wayback_resp.cdx.sha1hex
+ if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
+ raise Exception("wayback payload sha1hex mismatch")
full.append(WebResource(
surt=wayback_resp.cdx.surt,
timestamp=wayback_resp.cdx.datetime,
@@ -104,19 +121,26 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
return full
-def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> None:
+def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
adblock = load_adblock_rules()
wayback_client = WaybackClient()
html_resource = wayback_client.lookup_resource(url, "text/html")
if html_resource.status != "success":
- print(json.dumps(html_resource, indent=2))
- return
+ return IngestWebResult(
+ status=html_resource.status,
+ html_resource=html_resource,
+ )
file_meta = gen_file_metadata(html_resource.body)
- # XXX:
- assert file_meta['mimetype'] == "text/html"
+
+ if file_meta['mimetype'] != "text/html":
+ return IngestWebResult(
+ status="wrong-mimetype",
+ html_resource=html_resource,
+ file_meta=file_meta,
+ )
html_doc = HTMLParser(html_resource.body)
html_meta = html_extract_biblio(html_doc)
@@ -132,17 +156,15 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
else:
full_resources = fetch_html_resources(raw_resources, wayback_client, when)
- output = dict(
+ output = IngestWebResult(
status="success",
- #html_resource=html_resource,
+ html_resource=html_resource,
file_meta=file_meta,
html_fulltext=html_fulltext,
- # XXX:
- html_meta=html_meta and html_meta.dict(exclude_none=True, exclude={'release_date'}),
- resources=[r.dict(exclude_none=True, exclude={'timestamp'}) for r in full_resources],
+ html_meta=html_meta,
+ subresources=full_resources,
)
-
- print(json.dumps(output, indent=2))
+ return output
def main() -> None:
@@ -183,7 +205,8 @@ def main() -> None:
sys.exit(-1)
if args.func == "run_single":
- run_single(args.url, args.timestamp, args.quick_mode)
+ result = run_single(args.url, args.timestamp, args.quick_mode)
+ print(result.json(indent=2))
else:
#func = getattr(wp, args.func)
#func()