Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r--  fatcat_scholar/work_pipeline.py  48
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 631bda8..ea33a01 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -7,7 +7,7 @@ from typing import List, Dict, Tuple, Optional, Any, Sequence
 import minio
 import requests
 import internetarchive
-from fatcat_openapi_client import ReleaseEntity, FileEntity
+from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
 
 from fatcat_scholar.api_entities import *
 from fatcat_scholar.config import settings
@@ -169,6 +169,41 @@ class WorkPipeline:
             raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
         )
 
+    def fetch_webcapture_html_fulltext(
+        self, wc: WebcaptureEntity, release_ident: str,
+    ) -> Optional[Dict[str, Any]]:
+
+        primary_resources = [cdx for cdx in wc.cdx if cdx.url == wc.original_url]
+        if not primary_resources or primary_resources[0].mimetype != "text/html":
+            return None
+        html_meta = self.sandcrawler_db_client.get_html_meta(primary_resources[0].sha1)
+        if not html_meta:
+            return None
+        sha1hex = html_meta.get("sha1hex")
+        if not sha1hex:
+            return None
+        if html_meta.get("status") != "success" or not html_meta.get("has_teixml"):
+            return None
+
+        try:
+            tei_xml = self.sandcrawler_s3_client.get_blob(
+                bucket="sandcrawler",
+                prefix="",
+                folder="html_body",
+                sha1hex=sha1hex,
+                extension=".tei.xml",
+            )
+            # print(grobid_xml)
+        except minio.error.NoSuchKey:
+            return None
+
+        return dict(
+            html_meta=html_meta,
+            tei_xml=tei_xml,
+            release_ident=release_ident,
+            webcapture_ident=wc.ident,
+        )
+
     def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
         """
         Checks in IssueDB to see if this release is likely to have a copy in a
@@ -279,6 +314,7 @@ class WorkPipeline:
         grobid_fulltext: Optional[Any] = None
         pdf_meta: Optional[Any] = None
         pdftotext_fulltext: Optional[Any] = None
+        html_fulltext: Optional[Any] = None
         for ident in pref_idents:
             release = release_dict[ident]
             if not release.files:
@@ -295,7 +331,14 @@ class WorkPipeline:
                     pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
                 if grobid_fulltext or pdftotext_fulltext:
                     break
-            if grobid_fulltext or pdftotext_fulltext:
+            pdf_meta = None
+            for wc in release.webcaptures:
+                # find primary web capture object
+                html_fulltext = self.fetch_webcapture_html_fulltext(wc, ident)
+                if html_fulltext and html_fulltext.get("tei_xml"):
+                    break
+                html_fulltext = None
+            if grobid_fulltext or pdftotext_fulltext or html_fulltext:
                 break
 
         # find best accessible SIM metadata and fulltext
@@ -335,6 +378,7 @@ class WorkPipeline:
             grobid_fulltext=grobid_fulltext,
             pdftotext_fulltext=pdftotext_fulltext,
             pdf_meta=pdf_meta,
+            html_fulltext=html_fulltext,
             sim_fulltext=sim_fulltext,
         )
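
Note: for readers skimming the diff, the sketch below restates the primary-resource selection step of the new fetch_webcapture_html_fulltext() method as a small self-contained Python program. The CdxRow and Webcapture dataclasses are hypothetical stand-ins for the fatcat_openapi_client entity fields the method reads; in the real pipeline the selected row's sha1 is then used to look up HTML metadata in the sandcrawler DB and, if TEI-XML was extracted, to fetch it from S3.

from dataclasses import dataclass
from typing import List, Optional

# Hypothetical stand-ins for the entity fields used by the new method.
@dataclass
class CdxRow:
    url: str
    mimetype: str
    sha1: str

@dataclass
class Webcapture:
    original_url: str
    cdx: List[CdxRow]

def select_primary_html(wc: Webcapture) -> Optional[CdxRow]:
    # Mirror of the selection step: take the CDX row whose URL matches the
    # capture's original_url, and only accept it if it was served as text/html.
    primary = [row for row in wc.cdx if row.url == wc.original_url]
    if not primary or primary[0].mimetype != "text/html":
        return None
    return primary[0]

if __name__ == "__main__":
    wc = Webcapture(
        original_url="https://example.com/article",
        cdx=[
            CdxRow("https://example.com/style.css", "text/css", "a" * 40),
            CdxRow("https://example.com/article", "text/html", "b" * 40),
        ],
    )
    row = select_primary_html(wc)
    # In the pipeline, row.sha1 would feed get_html_meta() and the S3 fetch.
    print(row.sha1 if row else "no primary HTML resource")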