From c14e3a9d0ef68822da347e59e77c4b2b9a7b0828 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 6 Nov 2020 20:55:44 -0800
Subject: basic HTML transform/index support

---
 fatcat_scholar/transform.py | 48 +++++++++++++++++++++++++++++++++++++++++++--
 fatcat_scholar/worker.py    |  1 +
 2 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'fatcat_scholar')

diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index e92e34a..41ed417 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,9 +1,10 @@
 import sys
 import argparse
 import datetime
+import xml.etree.ElementTree as ET
 from typing import List, Dict, Optional, Any, Sequence
 
-from fatcat_openapi_client import ReleaseEntity, FileEntity
+from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
 
 from fatcat_scholar.api_entities import *
 from fatcat_scholar.schema import *
@@ -230,6 +231,36 @@ def es_fulltext_from_pdftotext(
     return _add_file_release_meta(ret, pdf_meta, re, fe)
 
 
+def es_fulltext_from_html(
+    html_fulltext: Dict[str, Any], re: ReleaseEntity, wc: WebcaptureEntity,
+) -> Optional[ScholarFulltext]:
+    """Build wayback fulltext from the TEI-XML <body> of an HTML capture.
+
+    Returns None without archive URLs, "tei_xml", or a TEI <body> element.
+    """
+    if not wc.archive_urls or not html_fulltext.get("tei_xml"):
+        return None
+    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
+    body = ET.fromstring(html_fulltext["tei_xml"]).find(".//tei:body", ns)
+    # An Element with no child elements is falsy, so test against None to
+    # keep a <body> that contains only text.
+    if body is None:
+        return None
+    ret = ScholarFulltext(
+        lang_code=re.language,
+        body=" ".join(body.itertext()),
+        acknowledgement=None,
+        annex=None,
+        release_ident=re.ident,
+        # webcapture_ident=wc.ident,
+        file_sha1=(html_fulltext.get("html_meta") or {}).get("sha1hex"),
+        file_mimetype="text/html",
+        access_url=wc.archive_urls[0].url,
+        access_type=AccessType.wayback,
+    )
+    return ret
+
+
 def biblio_metadata_hacks(biblio: ScholarBiblio) -> ScholarBiblio:  # noqa: C901
     """
     This function does platform/publisher specific metadata hacks.
@@ -435,7 +466,20 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             fulltext_file,
         )
 
-    # TODO: additional access list
+    if not fulltext and heavy.html_fulltext:
+        fulltext_release = [
+            r for r in heavy.releases if r.ident == heavy.html_fulltext["release_ident"]
+        ][0]
+        fulltext_webcapture = [
+            f
+            for f in fulltext_release.webcaptures
+            if f.ident == heavy.html_fulltext["webcapture_ident"]
+        ][0]
+        fulltext = es_fulltext_from_html(
+            heavy.html_fulltext, fulltext_release, fulltext_webcapture,
+        )
+
+    # TODO: additional access list (eg, HTML if only PDF currently)
     access_dict = dict()
     if fulltext and fulltext.access_type:
         access_dict[fulltext.access_type] = ScholarAccess(
diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py
index af84dd1..d2cc3cb 100644
--- a/fatcat_scholar/worker.py
+++ b/fatcat_scholar/worker.py
@@ -131,6 +131,7 @@ class IndexDocsWorker(KafkaWorker):
                 grobid_fulltext=obj.get("grobid_fulltext"),
                 pdftotext_fulltext=obj.get("pdftotext_fulltext"),
                 pdf_meta=obj.get("pdf_meta"),
+                html_fulltext=obj.get("html_fulltext"),
                 sim_fulltext=obj.get("sim_fulltext"),
             )
             es_doc = transform_heavy(bundle)
-- 
cgit v1.2.3