diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 19:51:00 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-18 11:58:26 -0800 |
commit | 4979c58ee91903148962f4d62d1a8d423349ad67 (patch) | |
tree | 09901de13601d058eb413614a0dc626e1e30f4d2 | |
parent | 7d38f46fc1970decfcfb1e3f4583b85605e5b8ee (diff) | |
download | fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.tar.gz fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.zip |
add basic html fulltext support to fetch pipeline
-rw-r--r-- | fatcat_scholar/sandcrawler.py | 11 | ||||
-rw-r--r-- | fatcat_scholar/schema.py | 1 | ||||
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 48 |
3 files changed, 58 insertions, 2 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py index 25c7002..416ed83 100644 --- a/fatcat_scholar/sandcrawler.py +++ b/fatcat_scholar/sandcrawler.py @@ -27,6 +27,17 @@ class SandcrawlerPostgrestClient: else: return None + def get_html_meta(self, sha1: str) -> Optional[Dict[str, Any]]: + resp = requests.get( + self.api_url + "/html_meta", params=dict(sha1hex="eq." + sha1) + ) + resp.raise_for_status() + resp_json = resp.json() + if resp_json: + return resp_json[0] + else: + return None + class SandcrawlerMinioClient(object): def __init__( diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 5637e0a..bec81ab 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -33,6 +33,7 @@ class IntermediateBundle(BaseModel): grobid_fulltext: Optional[Dict[str, Any]] pdftotext_fulltext: Optional[Dict[str, Any]] pdf_meta: Optional[Dict[str, Any]] + html_fulltext: Optional[Dict[str, Any]] sim_fulltext: Optional[Dict[str, Any]] fetched: Optional[datetime.datetime] diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 631bda8..ea33a01 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -7,7 +7,7 @@ from typing import List, Dict, Tuple, Optional, Any, Sequence import minio import requests import internetarchive -from fatcat_openapi_client import ReleaseEntity, FileEntity +from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity from fatcat_scholar.api_entities import * from fatcat_scholar.config import settings @@ -169,6 +169,41 @@ class WorkPipeline: raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident, ) + def fetch_webcapture_html_fulltext( + self, wc: WebcaptureEntity, release_ident: str, + ) -> Optional[Dict[str, Any]]: + + primary_resources = [cdx for cdx in wc.cdx if cdx.url == wc.original_url] + if not primary_resources or primary_resources[0].mimetype != "text/html": + return None + html_meta = self.sandcrawler_db_client.get_html_meta(primary_resources[0].sha1) + if not html_meta: + return None + sha1hex = html_meta.get("sha1hex") + if not sha1hex: + return None + if html_meta.get("status") != "success" or not html_meta.get("has_teixml"): + return None + + try: + tei_xml = self.sandcrawler_s3_client.get_blob( + bucket="sandcrawler", + prefix="", + folder="html_body", + sha1hex=sha1hex, + extension=".tei.xml", + ) + # print(grobid_xml) + except minio.error.NoSuchKey: + return None + + return dict( + html_meta=html_meta, + tei_xml=tei_xml, + release_ident=release_ident, + webcapture_ident=wc.ident, + ) + def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]: """ Checks in IssueDB to see if this release is likely to have a copy in a @@ -279,6 +314,7 @@ class WorkPipeline: grobid_fulltext: Optional[Any] = None pdf_meta: Optional[Any] = None pdftotext_fulltext: Optional[Any] = None + html_fulltext: Optional[Any] = None for ident in pref_idents: release = release_dict[ident] if not release.files: @@ -295,7 +331,14 @@ class WorkPipeline: pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident) if grobid_fulltext or pdftotext_fulltext: break - if grobid_fulltext or pdftotext_fulltext: + pdf_meta = None + for wc in release.webcaptures: + # find primary web capture object + html_fulltext = self.fetch_webcapture_html_fulltext(wc, ident) + if html_fulltext and html_fulltext.get("tei_xml"): + break + html_fulltext = None + if grobid_fulltext or pdftotext_fulltext or html_fulltext: break # find best accessible SIM metadata and fulltext @@ -335,6 +378,7 @@ class WorkPipeline: grobid_fulltext=grobid_fulltext, pdftotext_fulltext=pdftotext_fulltext, pdf_meta=pdf_meta, + html_fulltext=html_fulltext, sim_fulltext=sim_fulltext, ) |