author     Bryan Newbold <bnewbold@archive.org>   2020-06-29 17:05:32 -0700
committer  Bryan Newbold <bnewbold@archive.org>   2020-06-29 17:05:35 -0700
commit     c4f5ba60cf3581dc58875a4e56f8964560496753 (patch)
tree       43795fa1223961f28f2fac52a29e5e155a85f77f /fatcat_scholar/work_pipeline.py
parent     0f8b248259b4b57e425f7420883cb141565b2b22 (diff)
download   fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.tar.gz
           fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.zip
fetch pdftotext and pdf_meta from blobs, postgrest
This replaces the temporary COVID-19 content hack with production
content (text, thumbnail URLs) stored in postgrest and seaweedfs.
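
The `sandcrawler_db_client.get_pdf_meta()` helper this commit starts calling is defined outside this file and does not appear in the diff below. As a rough sketch of the postgrest interaction it implies, assuming sandcrawler-db exposes a `pdf_meta` table keyed by a `sha1hex` column (the class shape, port, and column name here are assumptions, not code from this commit):

```python
from typing import Any, Dict, Optional

import requests


class SandcrawlerPostgrestClient:
    """Hypothetical sketch of the postgrest lookup used by WorkPipeline."""

    def __init__(self, api_url: str = "http://localhost:3030"):
        # assumption: a postgrest instance serving sandcrawler-db at this URL
        self.api_url = api_url

    def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
        # postgrest filter syntax: GET /<table>?<column>=eq.<value>
        resp = requests.get(
            f"{self.api_url}/pdf_meta",
            params={"sha1hex": f"eq.{sha1.lower()}"},
        )
        resp.raise_for_status()
        rows = resp.json()
        # postgrest returns a JSON array of rows; empty means no match
        return rows[0] if rows else None
```

The pipeline code below only trusts rows whose `status` field is `"success"`, so a missing row and a failed pdftotext run are handled the same way.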
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
 fatcat_scholar/work_pipeline.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 18 deletions(-)
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 09ae02f..e3a0d8d 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -106,11 +106,11 @@ class WorkPipeline:
         # print(grobid_meta)
         try:
             grobid_xml = self.sandcrawler_s3_client.get_blob(
+                bucket="sandcrawler",
+                prefix="",
                 folder="grobid",
                 sha1hex=fe.sha1,
                 extension=".tei.xml",
-                prefix="",
-                bucket="sandcrawler",
             )
             # print(grobid_xml)
         except minio.error.NoSuchKey:
@@ -119,28 +119,50 @@ class WorkPipeline:
             tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident,
         )
 
+    def fetch_pdf_meta(self, fe: FileEntity, release_ident: str) -> Optional[Dict[str, Any]]:
+        """
+        Fetches pdftotext metadata from sandcrawler-db via postgrest HTTP
+        interface.
+
+        Returns a JSON object on success, or None if not found.
+
+        pdf_meta: dict
+        release_ident: Optional[str]
+        file_ident: Optional[str]
+        """
+        if not fe.sha1:
+            return None
+        pdf_meta = self.sandcrawler_db_client.get_pdf_meta(fe.sha1)
+        if not pdf_meta or pdf_meta["status"] != "success":
+            return None
+        return dict(
+            pdf_meta=pdf_meta, release_ident=release_ident, file_ident=fe.ident,
+        )
+
     def fetch_file_pdftotext(self, fe: FileEntity, release_ident: str) -> Optional[Any]:
         """
         raw_text: str
         release_ident: Optional[str]
         file_ident: Optional[str]
         """
-        # HACK: look for local pdftotext output
-        if self.fulltext_cache_dir:
-            local_txt_path = (
-                f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt"
+        if not fe.sha1:
+            return None
+        if not fe.urls:
+            return None
+        try:
+            raw_text = self.sandcrawler_s3_client.get_blob(
+                bucket="sandcrawler",
+                prefix="",
+                folder="text",
+                sha1hex=fe.sha1,
+                extension=".txt",
             )
-            try:
-                with open(local_txt_path, "r") as txt_file:
-                    raw_text = txt_file.read()
-                return dict(
-                    raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
-                )
-            except FileNotFoundError:
-                pass
-            except UnicodeDecodeError:
-                pass
-        return None
+            # print(raw_text)
+        except minio.error.NoSuchKey:
+            return None
+        return dict(
+            raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
+        )
 
     def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
         """
@@ -250,6 +272,7 @@ class WorkPipeline:
 
         # find best accessible fatcat file
         grobid_fulltext: Optional[Any] = None
+        pdf_meta: Optional[Any] = None
         pdftotext_fulltext: Optional[Any] = None
         for ident in pref_idents:
             release = release_dict[ident]
@@ -259,7 +282,10 @@ class WorkPipeline:
                 if not fe.sha1 or fe.mimetype not in (None, "application/pdf"):
                     continue
                 grobid_fulltext = self.fetch_file_grobid(fe, ident)
-                pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
+                pdf_meta = self.fetch_pdf_meta(fe, ident)
+                pdftotext_fulltext = None
+                if pdf_meta:
+                    pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
                 if grobid_fulltext or pdftotext_fulltext:
                     break
         if grobid_fulltext or pdftotext_fulltext:
@@ -301,6 +327,7 @@ class WorkPipeline:
             biblio_release_ident=pref_idents[0],
             grobid_fulltext=grobid_fulltext,
             pdftotext_fulltext=pdftotext_fulltext,
+            pdf_meta=pdf_meta,
             sim_fulltext=sim_fulltext,
         )
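
The `sandcrawler_s3_client.get_blob()` helper is likewise defined elsewhere; seaweedfs speaks the S3 API, which is why a stock minio client can fetch from it. A minimal sketch of the call pattern the diff implies, assuming a sha1-sharded object key layout (the exact key scheme is a guess, not taken from this commit):

```python
import minio


def get_blob(s3: minio.Minio, bucket: str, prefix: str, folder: str,
             sha1hex: str, extension: str) -> str:
    """Hypothetical stand-in for the pipeline's get_blob() helper."""
    # Assumed key layout: {prefix}{folder}/{aa}/{bb}/{sha1hex}{extension},
    # sharded on the leading hex bytes of the sha1 so that no single
    # "directory" grows unboundedly.
    obj = s3.get_object(
        bucket,
        f"{prefix}{folder}/{sha1hex[0:2]}/{sha1hex[2:4]}/{sha1hex}{extension}",
    )
    try:
        # both .tei.xml and .txt blobs are text; a missing key raises
        # minio.error.NoSuchKey, which the pipeline code above catches
        return obj.read().decode("utf-8")
    finally:
        obj.close()
        obj.release_conn()
```

A client for this sketch could be built with `minio.Minio("localhost:9000", access_key=..., secret_key=..., secure=False)`, pointed at either a real S3/minio endpoint or a seaweedfs S3 gateway.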