fetch pdftotext and pdf_meta from blobs, postgrest

This replaces the temporary COVID-19 content hack with production content (text, thumbnail URLs) stored in postgrest and seaweedfs.
author: Bryan Newbold <bnewbold@archive.org> 2020-06-29 17:05:32 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-29 17:05:35 -0700
commit: c4f5ba60cf3581dc58875a4e56f8964560496753 (patch)
tree: 43795fa1223961f28f2fac52a29e5e155a85f77f /fatcat_scholar
parent: 0f8b248259b4b57e425f7420883cb141565b2b22 (diff)
download: fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.tar.gz
fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.zip
4 files changed, 72 insertions, 43 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 6f0f85c..9c48cd9 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -16,6 +16,15 @@ class SandcrawlerPostgrestClient:
         else:
             return None
 
+    def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
+        resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1))
+        resp.raise_for_status()
+        resp_json = resp.json()
+        if resp_json:
+            return resp_json[0]
+        else:
+            return None
+
 
 class SandcrawlerMinioClient(object):
     def __init__(
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index d74f018..cf88011 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -7,7 +7,7 @@ auto-conversion of datetime objects.
 import re
 import datetime
 from enum import Enum
-from typing import Optional, List, Any
+from typing import Optional, List, Any, Dict
 
 import ftfy
 from bs4 import BeautifulSoup
@@ -30,9 +30,10 @@ class IntermediateBundle(BaseModel):
     doc_type: DocType
     releases: List[ReleaseEntity]
     biblio_release_ident: Optional[str]
-    grobid_fulltext: Optional[Any]
-    pdftotext_fulltext: Optional[Any]
-    sim_fulltext: Optional[Any]
+    grobid_fulltext: Optional[Dict[str, Any]]
+    pdftotext_fulltext: Optional[Dict[str, Any]]
+    pdf_meta: Optional[Dict[str, Any]]
+    sim_fulltext: Optional[Dict[str, Any]]
 
     class Config:
         arbitrary_types_allowed = True
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 847cc6e..7e7ef56 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -127,7 +127,7 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
 
 
 def _add_file_release_meta(
-    fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity
+    fulltext: ScholarFulltext, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
 ) -> ScholarFulltext:
     best_url = None
     best_url_type = None
@@ -150,11 +150,14 @@ def _add_file_release_meta(
     fulltext.file_mimetype = fe.mimetype
     fulltext.access_url = best_url
     fulltext.access_type = best_url_type
+    if pdf_meta is not None and pdf_meta.get("has_page0_thumbnail"):
+        # eg: https://blobs.fatcat.wiki/thumbnail/pdf/32/29/322909fe57cef73b10a166996a4528d337026d16.180px.jpg
+        fulltext.thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{ fe.sha1[0:2] }/{ fe.sha1[2:4] }/{ fe.sha1 }.180px.jpg"
     return fulltext
 
 
 def es_fulltext_from_grobid(
-    tei_xml: str, re: ReleaseEntity, fe: FileEntity
+    tei_xml: str, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
 ) -> Optional[ScholarFulltext]:
     obj = teixml2json(tei_xml)
     if not obj.get("body"):
@@ -164,23 +167,21 @@ def es_fulltext_from_grobid(
         body=obj.get("body"),
         acknowledgement=obj.get("acknowledgement"),
         annex=obj.get("annex"),
-        thumbnail_url=None,  # TODO: sandcrawler thumbnails
     )
-    return _add_file_release_meta(ret, re, fe)
+    return _add_file_release_meta(ret, pdf_meta, re, fe)
 
 
 def es_fulltext_from_pdftotext(
-    pdftotext: Any, re: ReleaseEntity, fe: FileEntity
+    raw_text: str, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
 ) -> Optional[ScholarFulltext]:
 
     ret = ScholarFulltext(
         lang_code=re.language,
-        body=pdftotext["raw_text"],
+        body=raw_text,
         acknowledgement=None,
         annex=None,
-        thumbnail_url=None,  # TODO: sandcrawler thumbnails
     )
-    return _add_file_release_meta(ret, re, fe)
+    return _add_file_release_meta(ret, pdf_meta, re, fe)
 
 
 def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
@@ -199,10 +200,11 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
 
     if heavy.doc_type == DocType.sim_page:
         assert ia_sim is not None
+        assert heavy.sim_fulltext is not None
         key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}"
         sim_issue = ia_sim.issue_item
         biblio = es_biblio_from_sim(heavy.sim_fulltext)
-        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)
+        # NOTE(review): es_fulltext_from_sim() call dropped; this branch no longer assigns fulltext -- confirm
     elif heavy.doc_type == DocType.work:
         work_ident = heavy.releases[0].work_id
         key = f"work_{work_ident}"
@@ -229,19 +231,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             if f.ident == heavy.grobid_fulltext["file_ident"]
         ][0]
         fulltext = es_fulltext_from_grobid(
-            heavy.grobid_fulltext["tei_xml"], fulltext_release, fulltext_file
+            heavy.grobid_fulltext["tei_xml"], heavy.pdf_meta, fulltext_release, fulltext_file
         )
 
-        # hack to pull through thumbnail from local pdftotext
-        if (
-            fulltext
-            and fulltext.file_sha1
-            and not fulltext.thumbnail_url
-            and heavy.pdftotext_fulltext
-        ):
-            # https://covid19.fatcat.wiki/fulltext_web/thumbnail/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.png
-            fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png"
-
     if not fulltext and heavy.pdftotext_fulltext:
         fulltext_release = [
             r
@@ -254,7 +246,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             if f.ident == heavy.pdftotext_fulltext["file_ident"]
         ][0]
         fulltext = es_fulltext_from_pdftotext(
-            heavy.pdftotext_fulltext, fulltext_release, fulltext_file
+            heavy.pdftotext_fulltext["raw_text"], heavy.pdf_meta, fulltext_release, fulltext_file
         )
 
     # TODO: additional access list
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 09ae02f..e3a0d8d 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -106,11 +106,11 @@ class WorkPipeline:
         # print(grobid_meta)
         try:
             grobid_xml = self.sandcrawler_s3_client.get_blob(
+                bucket="sandcrawler",
+                prefix="",
                 folder="grobid",
                 sha1hex=fe.sha1,
                 extension=".tei.xml",
-                prefix="",
-                bucket="sandcrawler",
             )
             # print(grobid_xml)
         except minio.error.NoSuchKey:
@@ -119,28 +119,50 @@ class WorkPipeline:
             tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident,
         )
 
+    def fetch_pdf_meta(self, fe: FileEntity, release_ident: str) -> Optional[Dict[str, Any]]:
+        """
+        Fetches pdftotext metadata (the "pdf_meta" row) from sandcrawler-db
+        via the postgrest HTTP interface.
+
+        Returns a dict on success; None if no sha1, no row, or status != "success".
+
+        pdf_meta: Dict[str, Any]
+        release_ident: Optional[str]
+        file_ident: Optional[str]
+        """
+        if not fe.sha1:
+            return None
+        pdf_meta = self.sandcrawler_db_client.get_pdf_meta(fe.sha1)
+        if not pdf_meta or pdf_meta["status"] != "success":
+            return None
+        return dict(
+            pdf_meta=pdf_meta, release_ident=release_ident, file_ident=fe.ident,
+        )
+
     def fetch_file_pdftotext(self, fe: FileEntity, release_ident: str) -> Optional[Any]:
         """
         raw_text: str
         release_ident: Optional[str]
         file_ident: Optional[str]
         """
-        # HACK: look for local pdftotext output
-        if self.fulltext_cache_dir:
-            local_txt_path = (
-                f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt"
+        if not fe.sha1:
+            return None
+        if not fe.urls:
+            return None
+        try:
+            raw_text = self.sandcrawler_s3_client.get_blob(
+                bucket="sandcrawler",
+                prefix="",
+                folder="text",
+                sha1hex=fe.sha1,
+                extension=".txt",
             )
-            try:
-                with open(local_txt_path, "r") as txt_file:
-                    raw_text = txt_file.read()
-                return dict(
-                    raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
-                )
-            except FileNotFoundError:
-                pass
-            except UnicodeDecodeError:
-                pass
-        return None
+            # print(raw_text)
+        except minio.error.NoSuchKey:
+            return None
+        return dict(
+            raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
+        )
 
     def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
         """
@@ -250,6 +272,7 @@ class WorkPipeline:
 
         # find best accessible fatcat file
         grobid_fulltext: Optional[Any] = None
+        pdf_meta: Optional[Any] = None
         pdftotext_fulltext: Optional[Any] = None
         for ident in pref_idents:
             release = release_dict[ident]
@@ -259,7 +282,10 @@ class WorkPipeline:
                 if not fe.sha1 or fe.mimetype not in (None, "application/pdf"):
                     continue
                 grobid_fulltext = self.fetch_file_grobid(fe, ident)
-                pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
+                pdf_meta = self.fetch_pdf_meta(fe, ident)
+                pdftotext_fulltext = None
+                if pdf_meta:
+                    pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
                 if grobid_fulltext or pdftotext_fulltext:
                     break
             if grobid_fulltext or pdftotext_fulltext:
@@ -301,6 +327,7 @@ class WorkPipeline:
             biblio_release_ident=pref_idents[0],
             grobid_fulltext=grobid_fulltext,
             pdftotext_fulltext=pdftotext_fulltext,
+            pdf_meta=pdf_meta,
             sim_fulltext=sim_fulltext,
         )
author	Bryan Newbold <bnewbold@archive.org>	2020-06-29 17:05:32 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-06-29 17:05:35 -0700
commit	c4f5ba60cf3581dc58875a4e56f8964560496753 (patch)
tree	43795fa1223961f28f2fac52a29e5e155a85f77f /fatcat_scholar
parent	0f8b248259b4b57e425f7420883cb141565b2b22 (diff)
download	fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.tar.gz fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.zip