about summary refs log tree commit diff stats
path: root/fatcat_scholar/work_pipeline.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-29 17:05:32 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-29 17:05:35 -0700
commit: c4f5ba60cf3581dc58875a4e56f8964560496753 (patch)
tree: 43795fa1223961f28f2fac52a29e5e155a85f77f /fatcat_scholar/work_pipeline.py
parent: 0f8b248259b4b57e425f7420883cb141565b2b22 (diff)
download: fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.tar.gz
download: fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.zip
fetch pdftotext and pdf_meta from blobs, postgrest
This replaces the temporary COVID-19 content hack with production content (text, thumbnail URLs) stored in postgrest and seaweedfs.
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- fatcat_scholar/work_pipeline.py | 63 +++++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 45 insertions(+), 18 deletions(-)
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 09ae02f..e3a0d8d 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -106,11 +106,11 @@ class WorkPipeline:
# print(grobid_meta)
try:
grobid_xml = self.sandcrawler_s3_client.get_blob(
+ bucket="sandcrawler",
+ prefix="",
folder="grobid",
sha1hex=fe.sha1,
extension=".tei.xml",
- prefix="",
- bucket="sandcrawler",
)
# print(grobid_xml)
except minio.error.NoSuchKey:
@@ -119,28 +119,50 @@ class WorkPipeline:
tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident,
)
+ def fetch_pdf_meta(self, fe: FileEntity, release_ident: str) -> Optional[Dict[str, Any]]:
+ """
+ Fetches pdftext metadata from sandcrawler-db via postgrest HTTP
+ interface.
+
+ Returns a JSON object on success, or None if not found.
+
+ raw_text: str
+ release_ident: Optional[str]
+ file_ident: Optional[str]
+ """
+ if not fe.sha1:
+ return None
+ pdf_meta = self.sandcrawler_db_client.get_pdf_meta(fe.sha1)
+ if not pdf_meta or pdf_meta["status"] != "success":
+ return None
+ return dict(
+ pdf_meta=pdf_meta, release_ident=release_ident, file_ident=fe.ident,
+ )
+
def fetch_file_pdftotext(self, fe: FileEntity, release_ident: str) -> Optional[Any]:
"""
raw_text: str
release_ident: Optional[str]
file_ident: Optional[str]
"""
- # HACK: look for local pdftotext output
- if self.fulltext_cache_dir:
- local_txt_path = (
- f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt"
+ if not fe.sha1:
+ return None
+ if not fe.urls:
+ return None
+ try:
+ raw_text = self.sandcrawler_s3_client.get_blob(
+ bucket="sandcrawler",
+ prefix="",
+ folder="text",
+ sha1hex=fe.sha1,
+ extension=".txt",
)
- try:
- with open(local_txt_path, "r") as txt_file:
- raw_text = txt_file.read()
- return dict(
- raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
- )
- except FileNotFoundError:
- pass
- except UnicodeDecodeError:
- pass
- return None
+ # print(raw_text)
+ except minio.error.NoSuchKey:
+ return None
+ return dict(
+ raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
+ )
def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
"""
@@ -250,6 +272,7 @@ class WorkPipeline:
# find best accessible fatcat file
grobid_fulltext: Optional[Any] = None
+ pdf_meta: Optional[Any] = None
pdftotext_fulltext: Optional[Any] = None
for ident in pref_idents:
release = release_dict[ident]
@@ -259,7 +282,10 @@ class WorkPipeline:
if not fe.sha1 or fe.mimetype not in (None, "application/pdf"):
continue
grobid_fulltext = self.fetch_file_grobid(fe, ident)
- pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
+ pdf_meta = self.fetch_pdf_meta(fe, ident)
+ pdftotext_fulltext = None
+ if pdf_meta:
+ pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
if grobid_fulltext or pdftotext_fulltext:
break
if grobid_fulltext or pdftotext_fulltext:
@@ -301,6 +327,7 @@ class WorkPipeline:
biblio_release_ident=pref_idents[0],
grobid_fulltext=grobid_fulltext,
pdftotext_fulltext=pdftotext_fulltext,
+ pdf_meta=pdf_meta,
sim_fulltext=sim_fulltext,
)