From c4f5ba60cf3581dc58875a4e56f8964560496753 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 29 Jun 2020 17:05:32 -0700 Subject: fetch pdftotext and pdf_meta from blobs, postgrest This replaces the temporary COVID-19 content hack with production content (text, thumbnail URLs) stored in postgrest and seaweedfs. --- fatcat_scholar/sandcrawler.py | 9 ++++++ fatcat_scholar/schema.py | 9 +++--- fatcat_scholar/transform.py | 34 +++++++++------------- fatcat_scholar/work_pipeline.py | 63 +++++++++++++++++++++++++++++------------ 4 files changed, 72 insertions(+), 43 deletions(-) diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py index 6f0f85c..9c48cd9 100644 --- a/fatcat_scholar/sandcrawler.py +++ b/fatcat_scholar/sandcrawler.py @@ -16,6 +16,15 @@ class SandcrawlerPostgrestClient: else: return None + def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]: + resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)) + resp.raise_for_status() + resp_json = resp.json() + if resp_json: + return resp_json[0] + else: + return None + class SandcrawlerMinioClient(object): def __init__( diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index d74f018..cf88011 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -7,7 +7,7 @@ auto-conversion of datetime objects. import re import datetime from enum import Enum -from typing import Optional, List, Any +from typing import Optional, List, Any, Dict import ftfy from bs4 import BeautifulSoup @@ -30,9 +30,10 @@ class IntermediateBundle(BaseModel): doc_type: DocType releases: List[ReleaseEntity] biblio_release_ident: Optional[str] - grobid_fulltext: Optional[Any] - pdftotext_fulltext: Optional[Any] - sim_fulltext: Optional[Any] + grobid_fulltext: Optional[Dict[str, Any]] + pdftotext_fulltext: Optional[Dict[str, Any]] + pdf_meta: Optional[Dict[str, Any]] + sim_fulltext: Optional[Dict[str, Any]] class Config: arbitrary_types_allowed = True diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 847cc6e..7e7ef56 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -127,7 +127,7 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: def _add_file_release_meta( - fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity + fulltext: ScholarFulltext, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity ) -> ScholarFulltext: best_url = None best_url_type = None @@ -150,11 +150,14 @@ def _add_file_release_meta( fulltext.file_mimetype = fe.mimetype fulltext.access_url = best_url fulltext.access_type = best_url_type + if pdf_meta is not None and pdf_meta.get("has_page0_thumbnail"): + # eg: https://blobs.fatcat.wiki/thumbnail/pdf/32/29/322909fe57cef73b10a166996a4528d337026d16.180px.jpg + fulltext.thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{ fe.sha1[0:2] }/{ fe.sha1[2:4] }/{ fe.sha1 }.180px.jpg" return fulltext def es_fulltext_from_grobid( - tei_xml: str, re: ReleaseEntity, fe: FileEntity + tei_xml: str, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity ) -> Optional[ScholarFulltext]: obj = teixml2json(tei_xml) if not obj.get("body"): @@ -164,23 +167,21 @@ def es_fulltext_from_grobid( body=obj.get("body"), acknowledgement=obj.get("acknowledgement"), annex=obj.get("annex"), - thumbnail_url=None, # TODO: sandcrawler thumbnails ) - return _add_file_release_meta(ret, re, fe) + return _add_file_release_meta(ret, pdf_meta, re, fe) def es_fulltext_from_pdftotext( - pdftotext: Any, re: ReleaseEntity, fe: FileEntity + raw_text: str, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity ) -> Optional[ScholarFulltext]: ret = ScholarFulltext( lang_code=re.language, - body=pdftotext["raw_text"], + body=raw_text, acknowledgement=None, annex=None, - thumbnail_url=None, # TODO: sandcrawler thumbnails ) - return _add_file_release_meta(ret, re, fe) + return _add_file_release_meta(ret, pdf_meta, re, fe) def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: @@ -199,10 +200,11 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: if heavy.doc_type == DocType.sim_page: assert ia_sim is not None + assert heavy.sim_fulltext is not None key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}" sim_issue = ia_sim.issue_item biblio = es_biblio_from_sim(heavy.sim_fulltext) - fulltext = es_fulltext_from_sim(heavy.sim_fulltext) + # fulltext extracted from heavy.sim_fulltext above elif heavy.doc_type == DocType.work: work_ident = heavy.releases[0].work_id key = f"work_{work_ident}" @@ -229,19 +231,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: if f.ident == heavy.grobid_fulltext["file_ident"] ][0] fulltext = es_fulltext_from_grobid( - heavy.grobid_fulltext["tei_xml"], fulltext_release, fulltext_file + heavy.grobid_fulltext["tei_xml"], heavy.pdf_meta, fulltext_release, fulltext_file ) - # hack to pull through thumbnail from local pdftotext - if ( - fulltext - and fulltext.file_sha1 - and not fulltext.thumbnail_url - and heavy.pdftotext_fulltext - ): - # https://covid19.fatcat.wiki/fulltext_web/thumbnail/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.png - fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png" - if not fulltext and heavy.pdftotext_fulltext: fulltext_release = [ r @@ -254,7 +246,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: if f.ident == heavy.pdftotext_fulltext["file_ident"] ][0] fulltext = es_fulltext_from_pdftotext( - heavy.pdftotext_fulltext, fulltext_release, fulltext_file + heavy.pdftotext_fulltext["raw_text"], heavy.pdf_meta, fulltext_release, fulltext_file ) # TODO: additional access list diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 09ae02f..e3a0d8d 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -106,11 +106,11 @@ class WorkPipeline: # print(grobid_meta) try: grobid_xml = self.sandcrawler_s3_client.get_blob( + bucket="sandcrawler", + prefix="", folder="grobid", sha1hex=fe.sha1, extension=".tei.xml", - prefix="", - bucket="sandcrawler", ) # print(grobid_xml) except minio.error.NoSuchKey: @@ -119,28 +119,50 @@ class WorkPipeline: tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident, ) + def fetch_pdf_meta(self, fe: FileEntity, release_ident: str) -> Optional[Dict[str, Any]]: + """ + Fetches pdftext metadata from sandcrawler-db via postgrest HTTP + interface. + + Returns a JSON object on success, or None if not found. + + raw_text: str + release_ident: Optional[str] + file_ident: Optional[str] + """ + if not fe.sha1: + return None + pdf_meta = self.sandcrawler_db_client.get_pdf_meta(fe.sha1) + if not pdf_meta or pdf_meta["status"] != "success": + return None + return dict( + pdf_meta=pdf_meta, release_ident=release_ident, file_ident=fe.ident, + ) + def fetch_file_pdftotext(self, fe: FileEntity, release_ident: str) -> Optional[Any]: """ raw_text: str release_ident: Optional[str] file_ident: Optional[str] """ - # HACK: look for local pdftotext output - if self.fulltext_cache_dir: - local_txt_path = ( - f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt" + if not fe.sha1: + return None + if not fe.urls: + return None + try: + raw_text = self.sandcrawler_s3_client.get_blob( + bucket="sandcrawler", + prefix="", + folder="text", + sha1hex=fe.sha1, + extension=".txt", ) - try: - with open(local_txt_path, "r") as txt_file: - raw_text = txt_file.read() - return dict( - raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident, - ) - except FileNotFoundError: - pass - except UnicodeDecodeError: - pass - return None + # print(raw_text) + except minio.error.NoSuchKey: + return None + return dict( + raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident, + ) def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]: """ @@ -250,6 +272,7 @@ class WorkPipeline: # find best accessible fatcat file grobid_fulltext: Optional[Any] = None + pdf_meta: Optional[Any] = None pdftotext_fulltext: Optional[Any] = None for ident in pref_idents: release = release_dict[ident] @@ -259,7 +282,10 @@ class WorkPipeline: if not fe.sha1 or fe.mimetype not in (None, "application/pdf"): continue grobid_fulltext = self.fetch_file_grobid(fe, ident) - pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident) + pdf_meta = self.fetch_pdf_meta(fe, ident) + pdftotext_fulltext = None + if pdf_meta: + pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident) if grobid_fulltext or pdftotext_fulltext: break if grobid_fulltext or pdftotext_fulltext: @@ -301,6 +327,7 @@ class WorkPipeline: biblio_release_ident=pref_idents[0], grobid_fulltext=grobid_fulltext, pdftotext_fulltext=pdftotext_fulltext, + pdf_meta=pdf_meta, sim_fulltext=sim_fulltext, ) -- cgit v1.2.3