1 files changed, 71 insertions, 49 deletions
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 9c72dd5..bf25d5d 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,4 +1,3 @@
-
 import argparse
 import datetime
 import io
@@ -11,16 +10,20 @@ import pydantic
 import trafilatura
 from selectolax.parser import HTMLParser
 
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
-from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError,
-                            cdx_to_dict, fix_transfer_encoding)
-from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal
+from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
+                                       html_extract_resources, load_adblock_rules)
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient,
+                            WaybackContentError, cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import (clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime,
+                              url_fuzzy_equal)
 
 TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
 
+
 def html_extract_body_teixml(doc: bytes) -> dict:
     try:
-        tei_xml = trafilatura.extract(doc,
+        tei_xml = trafilatura.extract(
+            doc,
             output_format='xmltei',
             include_comments=False,
             include_formatting=True,
@@ -33,13 +36,19 @@ def html_extract_body_teixml(doc: bytes) -> dict:
     if tei_xml:
         body_txt = teixml_body_text(tei_xml)
         word_count = len(body_txt.split())
-        return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count)
-    elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'):
+        return dict(status="success",
+                    agent=TRAFILATURA_AGENT,
+                    tei_xml=tei_xml,
+                    word_count=word_count)
+    elif doc.startswith(
+            b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'
+    ):
         # hack for firstmonday.org
         return html_extract_body_teixml(doc[106:])
     else:
         return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
 
+
 def teixml_body_text(doc_xml: str) -> str:
     ns = {"tei": "http://www.tei-c.org/ns/1.0"}
     tree = ET.fromstring(doc_xml)
@@ -49,6 +58,7 @@ def teixml_body_text(doc_xml: str) -> str:
     else:
         return ""
 
+
 class WebResource(pydantic.BaseModel):
     surt: str
     timestamp: datetime.datetime
@@ -61,16 +71,15 @@ class WebResource(pydantic.BaseModel):
     resource_type: Optional[str]
 
     class Config:
-        json_encoders = {
-            datetime.datetime: lambda dt: dt.isoformat()
-        }
+        json_encoders = {datetime.datetime: lambda dt: dt.isoformat()}
+
 
 class IngestWebResult(pydantic.BaseModel):
     status: str
     hit: bool
     error_message: Optional[str]
     cdx: Optional[dict]
-    terminal: Optional[Any] # TODO
+    terminal: Optional[Any]  # TODO
     request: Optional[Any]  # TODO
     file_meta: Optional[dict]
     html_biblio: Optional[BiblioMetadata]
@@ -84,6 +93,7 @@ class IngestWebResult(pydantic.BaseModel):
             datetime.datetime: lambda dt: dt.isoformat(),
         }
 
+
 class HtmlMetaRow(pydantic.BaseModel):
     sha1hex: str
     status: str
@@ -106,7 +116,7 @@ class HtmlMetaRow(pydantic.BaseModel):
         """
         return (
             self.sha1hex,
-            datetime.datetime.now(), # updated
+            datetime.datetime.now(),  # updated
             self.status,
             self.scope,
             self.has_teixml,
@@ -117,7 +127,8 @@ class HtmlMetaRow(pydantic.BaseModel):
         )
 
 
-def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
+                               when: Optional[datetime.datetime]) -> List[WebResource]:
     """
     This is the lazy version that just does a CDX lookup for each resource.
 
@@ -132,27 +143,30 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
         if not cdx_row:
             raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
         if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']):
-            print(f"  WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr)
+            print(f"  WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}",
+                  file=sys.stderr)
         if not cdx_row.status_code:
             # TODO: fall back to a full fetch?
             print(f"  WARN: skipping revisit record", file=sys.stderr)
             continue
-        full.append(WebResource(
-            surt=cdx_row.surt,
-            timestamp=cdx_row.datetime,
-            url=cdx_row.url,
-            sha1hex=cdx_row.sha1hex,
-            mimetype=cdx_row.mimetype,
-            status_code=cdx_row.status_code,
-            size=None,
-            sha256hex=None,
-            resource_type=resource['type'],
-        ))
+        full.append(
+            WebResource(
+                surt=cdx_row.surt,
+                timestamp=cdx_row.datetime,
+                url=cdx_row.url,
+                sha1hex=cdx_row.sha1hex,
+                mimetype=cdx_row.mimetype,
+                status_code=cdx_row.status_code,
+                size=None,
+                sha256hex=None,
+                resource_type=resource['type'],
+            ))
 
     return full
 
 
-def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient,
+                         when: Optional[datetime.datetime]) -> List[WebResource]:
     """
     This is the full version which fetches each resource from wayback/petabox
     and calculates additional hashes.
@@ -168,23 +182,28 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
             raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
         file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
         if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
-            raise WaybackContentError(f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}")
-        full.append(WebResource(
-            surt=wayback_resp.cdx.surt,
-            timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
-            url=wayback_resp.cdx.url,
-            sha1hex=file_meta['sha1hex'],
-            mimetype=file_meta['mimetype'],
-            status_code=wayback_resp.cdx.status_code or wayback_resp.revisit_cdx.status_code,
-            size=file_meta['size_bytes'],
-            sha256hex=file_meta['sha256hex'],
-            resource_type=resource['type'],
-        ))
+            raise WaybackContentError(
+                f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}"
+            )
+        full.append(
+            WebResource(
+                surt=wayback_resp.cdx.surt,
+                timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
+                url=wayback_resp.cdx.url,
+                sha1hex=file_meta['sha1hex'],
+                mimetype=file_meta['mimetype'],
+                status_code=wayback_resp.cdx.status_code
+                or wayback_resp.revisit_cdx.status_code,
+                size=file_meta['size_bytes'],
+                sha256hex=file_meta['sha256hex'],
+                resource_type=resource['type'],
+            ))
 
     return full
 
 
-def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]:
+def html_guess_platform(url: str, doc: HTMLParser,
+                        biblio: Optional[BiblioMetadata]) -> Optional[str]:
 
     generator: Optional[str] = None
     generator_elem = doc.css_first("meta[name='generator']")
@@ -229,7 +248,9 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
 
     return None
 
-def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
+
+def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata],
+                     word_count: Optional[int]) -> str:
     """
     This function tries to guess if an HTML document represents one of:
 
@@ -328,7 +349,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     return "unknown"
 
 
-def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
+def run_single(url: str,
+               timestamp: Optional[str] = None,
+               quick_mode: bool = False) -> IngestWebResult:
 
     adblock = load_adblock_rules()
     wayback_client = WaybackClient()
@@ -375,7 +398,8 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
 
     full_resources: List[WebResource] = []
     if quick_mode:
-        full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
+        full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client,
+                                                    when)
     else:
         full_resources = fetch_html_resources(raw_resources, wayback_client, when)
 
@@ -399,14 +423,11 @@ def main() -> None:
         python -m sandcrawler.ingest_html
     """
 
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     subparsers = parser.add_subparsers()
 
-    sub = subparsers.add_parser(
-        "single", help="tries to ingest a single URL, dumps result to stdout"
-    )
+    sub = subparsers.add_parser("single",
+                                help="tries to ingest a single URL, dumps result to stdout")
     sub.set_defaults(func="run_single")
     sub.add_argument(
         "url",
@@ -437,5 +458,6 @@ def main() -> None:
         #func()
         raise NotImplementedError()
 
+
 if __name__ == "__main__":
     main()