def get_html_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
    """
    Looks up the `html_meta` row for a single SHA-1 hex digest.

    Issues a GET against the `/html_meta` endpoint under `self.api_url`,
    filtering on `sha1hex` with an `eq.` (equality) filter.

    Returns the first row of the JSON response, or None when the response
    body is empty. Any HTTP error status is raised by
    `raise_for_status()`.
    """
    endpoint = self.api_url + "/html_meta"
    query = {"sha1hex": "eq." + sha1}
    response = requests.get(endpoint, params=query)
    response.raise_for_status()
    rows = response.json()
    if not rows:
        return None
    return rows[0]
def fetch_webcapture_html_fulltext(
    self, wc: WebcaptureEntity, release_ident: str,
) -> Optional[Dict[str, Any]]:
    """
    Fetches extracted HTML fulltext (TEI-XML) for a web capture, if any.

    The "primary" resource is the first CDX row whose URL equals the
    capture's `original_url`; it must have mimetype `text/html`. Its SHA-1
    is used to look up the `html_meta` row, and the TEI-XML blob is then
    fetched from the `html_body` folder of the `sandcrawler` bucket.

    Returns a dict with keys `html_meta`, `tei_xml`, `release_ident`, and
    `webcapture_ident`; or None when there is no primary HTML resource, no
    successful `html_meta` row with TEI-XML, or the blob does not exist.
    """

    # wc.cdx may be unset (None) on an entity; treat that the same as an
    # empty capture list instead of raising TypeError while iterating.
    primary = next(
        (cdx for cdx in (wc.cdx or []) if cdx.url == wc.original_url), None
    )
    if primary is None or primary.mimetype != "text/html":
        return None

    html_meta = self.sandcrawler_db_client.get_html_meta(primary.sha1)
    if not html_meta:
        return None
    sha1hex = html_meta.get("sha1hex")
    if not sha1hex:
        return None
    if html_meta.get("status") != "success" or not html_meta.get("has_teixml"):
        return None

    try:
        tei_xml = self.sandcrawler_s3_client.get_blob(
            bucket="sandcrawler",
            prefix="",
            folder="html_body",
            sha1hex=sha1hex,
            extension=".tei.xml",
        )
    except minio.error.NoSuchKey:
        # metadata row exists but the blob was never stored (or was removed)
        return None

    return dict(
        html_meta=html_meta,
        tei_xml=tei_xml,
        release_ident=release_ident,
        webcapture_ident=wc.ident,
    )
+378,7 @@ class WorkPipeline: grobid_fulltext=grobid_fulltext, pdftotext_fulltext=pdftotext_fulltext, pdf_meta=pdf_meta, + html_fulltext=html_fulltext, sim_fulltext=sim_fulltext, ) -- cgit v1.2.3