Diffstat (limited to 'python/sandcrawler/ingest_html.py')
-rw-r--r--  python/sandcrawler/ingest_html.py  139
1 file changed, 82 insertions, 57 deletions
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 91e5c6e..0ff7fe0 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -9,12 +9,26 @@
 import pydantic
 import trafilatura
 from selectolax.parser import HTMLParser
 
-from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
-                                       html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, NoCaptureError, WaybackClient, WaybackContentError,
-                            cdx_to_dict, fix_transfer_encoding)
-from sandcrawler.misc import (datetime_to_cdx, gen_file_metadata, parse_cdx_datetime,
-                              url_fuzzy_equal)
+from sandcrawler.html_metadata import (
+    BiblioMetadata,
+    html_extract_biblio,
+    html_extract_resources,
+    load_adblock_rules,
+)
+from sandcrawler.ia import (
+    CdxApiClient,
+    NoCaptureError,
+    WaybackClient,
+    WaybackContentError,
+    cdx_to_dict,
+    fix_transfer_encoding,
+)
+from sandcrawler.misc import (
+    datetime_to_cdx,
+    gen_file_metadata,
+    parse_cdx_datetime,
+    url_fuzzy_equal,
+)
 
 TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
 
@@ -23,7 +37,7 @@ def html_extract_body_teixml(doc: bytes) -> dict:
     try:
         tei_xml = trafilatura.extract(
             doc,
-            output_format='xmltei',
+            output_format="xmltei",
             include_comments=False,
             include_formatting=True,
         )
@@ -35,12 +49,11 @@ def html_extract_body_teixml(doc: bytes) -> dict:
     if tei_xml:
         body_txt = teixml_body_text(tei_xml)
         word_count = len(body_txt.split())
-        return dict(status="success",
-                    agent=TRAFILATURA_AGENT,
-                    tei_xml=tei_xml,
-                    word_count=word_count)
+        return dict(
+            status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count
+        )
     elif doc.startswith(
-            b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
+        b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
     ):
         # hack for firstmonday.org
         return html_extract_body_teixml(doc[106:])
@@ -51,7 +64,7 @@ def html_extract_body_teixml(doc: bytes) -> dict:
 def teixml_body_text(doc_xml: str) -> str:
     ns = {"tei": "http://www.tei-c.org/ns/1.0"}
     tree = ET.fromstring(doc_xml)
-    body = tree.find('.//tei:body', ns)
+    body = tree.find(".//tei:body", ns)
     if body:
         return " ".join(body.itertext())
     else:
@@ -126,8 +139,9 @@ class HtmlMetaRow(pydantic.BaseModel):
     )
 
 
-def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
-                               when: Optional[datetime.datetime]) -> List[WebResource]:
+def quick_fetch_html_resources(
+    resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
     """
     This is the lazy version that just does a CDX lookup for each resource.
 
@@ -138,12 +152,13 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
     full = []
     closest = when and datetime_to_cdx(when)
     for resource in resources:
-        cdx_row = cdx_client.lookup_best(resource['url'], closest=closest)
+        cdx_row = cdx_client.lookup_best(resource["url"], closest=closest)
         if not cdx_row:
             raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
-        if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']):
-            print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}",
-                  file=sys.stderr)
+        if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]):
+            print(
+                f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr
+            )
         if not cdx_row.status_code:
             # TODO: fall back to a full fetch?
             print(" WARN: skipping revisit record", file=sys.stderr)
@@ -158,14 +173,16 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
                 status_code=cdx_row.status_code,
                 size=None,
                 sha256hex=None,
-                resource_type=resource['type'],
-            ))
+                resource_type=resource["type"],
+            )
+        )
 
     return full
 
 
-def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient,
-                         when: Optional[datetime.datetime]) -> List[WebResource]:
+def fetch_html_resources(
+    resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
     """
     This is the full version which fetches each resource from wayback/petabox
     and calculates additional hashes.
@@ -176,11 +193,11 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient,
     full = []
     closest = when and datetime_to_cdx(when)
     for resource in resources:
-        wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest)
-        if not wayback_resp or wayback_resp.status != 'success':
+        wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest)
+        if not wayback_resp or wayback_resp.status != "success":
             raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
         file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
-        if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
+        if file_meta["sha1hex"] != wayback_resp.cdx.sha1hex:
             raise WaybackContentError(
                 f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}"
             )
@@ -189,25 +206,27 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient,
                 surt=wayback_resp.cdx.surt,
                 timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
                 url=wayback_resp.cdx.url,
-                sha1hex=file_meta['sha1hex'],
-                mimetype=file_meta['mimetype'],
+                sha1hex=file_meta["sha1hex"],
+                mimetype=file_meta["mimetype"],
                 status_code=wayback_resp.cdx.status_code or wayback_resp.revisit_cdx.status_code,
-                size=file_meta['size_bytes'],
-                sha256hex=file_meta['sha256hex'],
-                resource_type=resource['type'],
-            ))
+                size=file_meta["size_bytes"],
+                sha256hex=file_meta["sha256hex"],
+                resource_type=resource["type"],
+            )
+        )
 
     return full
 
 
-def html_guess_platform(url: str, doc: HTMLParser,
-                        biblio: Optional[BiblioMetadata]) -> Optional[str]:
+def html_guess_platform(
+    url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
+) -> Optional[str]:
 
     generator: Optional[str] = None
     generator_elem = doc.css_first("meta[name='generator']")
     if generator_elem:
-        generator = generator_elem.attrs['content']
+        generator = generator_elem.attrs["content"]
     else:
         generator_elem = doc.css_first("a[id='developedBy']")
         if generator_elem:
@@ -226,7 +245,10 @@ def html_guess_platform(url: str, doc: HTMLParser,
             return "ojs"
     else:
         try:
-            if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
+            if (
+                'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>'
+                in doc.html
+            ):
                 return "ojs"
             if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
                 return "arpha"
@@ -236,20 +258,21 @@ def html_guess_platform(url: str, doc: HTMLParser,
             pass
 
     icon_elem = doc.css_first("link[type='image/x-icon']")
-    if icon_elem and 'href' in icon_elem.attrs:
-        if 'journalssystem.com' in icon_elem.attrs['href']:
+    if icon_elem and "href" in icon_elem.attrs:
+        if "journalssystem.com" in icon_elem.attrs["href"]:
             return "journalssystem.com"
-        elif 'indexcopernicus.com' in icon_elem.attrs['href']:
+        elif "indexcopernicus.com" in icon_elem.attrs["href"]:
             return "indexcopernicus"
 
-    if 'scielo' in url:
+    if "scielo" in url:
         return "scielo"
 
     return None
 
 
-def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata],
-                     word_count: Optional[int]) -> str:
+def html_guess_scope(
+    url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]
+) -> str:
     """
     This function tries to guess if an HTML document represents one of:
 
@@ -275,7 +298,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     """
 
     # assert that this is a real URL
-    assert url.count('/') >= 2
+    assert url.count("/") >= 2
 
     # basic paywall and loginwall detection based on URL
     if url.endswith("/cookieAbsent"):
@@ -293,7 +316,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
         return "blocked-captcha"
 
     # is this the top-level URL of the domain? aka, no path?
-    if url.count('/') <= 2 or (url.count('/') == 3) and url.endswith('/'):
+    if url.count("/") <= 2 or (url.count("/") == 3) and url.endswith("/"):
         return "homepage-domain"
 
     platform = html_guess_platform(url, doc, biblio)
@@ -340,7 +363,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     if word_count is not None:
         if word_count < 20:
             return "stub"
-        elif word_count > 500 and platform in ['wordpress', 'blogger']:
+        elif word_count > 500 and platform in ["wordpress", "blogger"]:
             return "article-fulltext"
         elif word_count > 1200:
             return "article-fulltext"
@@ -348,9 +371,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     return "unknown"
 
 
-def run_single(url: str,
-               timestamp: Optional[str] = None,
-               quick_mode: bool = False) -> IngestWebResult:
+def run_single(
+    url: str, timestamp: Optional[str] = None, quick_mode: bool = False
+) -> IngestWebResult:
 
     adblock = load_adblock_rules()
     wayback_client = WaybackClient()
@@ -368,7 +391,7 @@ def run_single(url: str,
     file_meta = gen_file_metadata(html_resource.body)
     file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)
 
-    if file_meta['mimetype'] not in ("text/html", "text/xml"):
+    if file_meta["mimetype"] not in ("text/html", "text/xml"):
         return IngestWebResult(
             status="wrong-mimetype",
             hit=False,
@@ -379,8 +402,8 @@ def run_single(url: str,
     html_doc = HTMLParser(html_resource.body)
     html_biblio = html_extract_biblio(url, html_doc)
     html_body = html_extract_body_teixml(html_resource.body)
-    html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count'))
-    if html_scope not in ('article-fulltext', 'unknown'):
+    html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get("word_count"))
+    if html_scope not in ("article-fulltext", "unknown"):
         return IngestWebResult(
             status="wrong-scope",
             hit=False,
@@ -397,8 +420,9 @@ def run_single(url: str,
 
     full_resources: List[WebResource] = []
     if quick_mode:
-        full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client,
-                                                    when)
+        full_resources = quick_fetch_html_resources(
+            raw_resources, wayback_client.cdx_client, when
+        )
    else:
         full_resources = fetch_html_resources(raw_resources, wayback_client, when)
 
@@ -425,8 +449,9 @@ def main() -> None:
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     subparsers = parser.add_subparsers()
 
-    sub = subparsers.add_parser("single",
-                                help="tries to ingest a single URL, dumps result to stdout")
+    sub = subparsers.add_parser(
+        "single", help="tries to ingest a single URL, dumps result to stdout"
+    )
     sub.set_defaults(func="run_single")
     sub.add_argument(
         "url",
@@ -453,8 +478,8 @@ def main() -> None:
         result = run_single(args.url, args.timestamp, args.quick_mode)
         print(result.json(indent=2, exclude_none=True))
     else:
-        #func = getattr(wp, args.func)
-        #func()
+        # func = getattr(wp, args.func)
+        # func()
         raise NotImplementedError()
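This commit appears to be a pure formatting pass (wrapped call chains and signatures re-flowed with trailing commas, single quotes switched to double quotes where no escaping is needed); no behavior changes. For orientation, here is a minimal sketch of driving the run_single() entrypoint touched throughout this diff. The URL and timestamp values are invented examples, and the 14-digit CDX-style timestamp format is an assumption based on the datetime_to_cdx/parse_cdx_datetime helpers imported above:

    from sandcrawler.ingest_html import run_single

    # Hypothetical inputs, not taken from this commit. quick_mode=True routes
    # sub-resource fetching through quick_fetch_html_resources(), which only
    # does CDX lookups instead of full wayback/petabox body fetches.
    result = run_single(
        "https://example.com/journal/article/view/123",  # example URL
        timestamp="20201101000000",  # assumed CDX-style 14-digit timestamp
        quick_mode=True,
    )
    # Same serialization that main() uses for the "single" subcommand
    print(result.json(indent=2, exclude_none=True))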