diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 19:51:00 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-18 11:58:26 -0800 |
commit | 4979c58ee91903148962f4d62d1a8d423349ad67 (patch) | |
tree | 09901de13601d058eb413614a0dc626e1e30f4d2 | |
parent | 7d38f46fc1970decfcfb1e3f4583b85605e5b8ee (diff) | |
download | fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.tar.gz fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.zip |
add basic html fulltext support to fetch pipeline
-rw-r--r-- | fatcat_scholar/sandcrawler.py | 11 | ||||
-rw-r--r-- | fatcat_scholar/schema.py | 1 | ||||
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 48 |
3 files changed, 58 insertions, 2 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py index 25c7002..416ed83 100644 --- a/fatcat_scholar/sandcrawler.py +++ b/fatcat_scholar/sandcrawler.py @@ -27,6 +27,17 @@ class SandcrawlerPostgrestClient: else: return None + def get_html_meta(self, sha1: str) -> Optional[Dict[str, Any]]: + resp = requests.get( + self.api_url + "/html_meta", params=dict(sha1hex="eq." + sha1) + ) + resp.raise_for_status() + resp_json = resp.json() + if resp_json: + return resp_json[0] + else: + return None + class SandcrawlerMinioClient(object): def __init__( diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 5637e0a..bec81ab 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -33,6 +33,7 @@ class IntermediateBundle(BaseModel): grobid_fulltext: Optional[Dict[str, Any]] pdftotext_fulltext: Optional[Dict[str, Any]] pdf_meta: Optional[Dict[str, Any]] + html_fulltext: Optional[Dict[str, Any]] sim_fulltext: Optional[Dict[str, Any]] fetched: Optional[datetime.datetime] diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 631bda8..ea33a01 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -7,7 +7,7 @@ from typing import List, Dict, Tuple, Optional, Any, Sequence import minio import requests import internetarchive -from fatcat_openapi_client import ReleaseEntity, FileEntity +from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity from fatcat_scholar.api_entities import * from fatcat_scholar.config import settings @@ -169,6 +169,41 @@ class WorkPipeline: raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident, ) + def fetch_webcapture_html_fulltext( + self, wc: WebcaptureEntity, release_ident: str, + ) -> Optional[Dict[str, Any]]: + + primary_resources = [cdx for cdx in wc.cdx if cdx.url == wc.original_url] + if not primary_resources or primary_resources[0].mimetype != "text/html": + return None + html_meta = self.sandcrawler_db_client.get_html_meta(primary_resources[0].sha1) + if not html_meta: + return None + sha1hex = html_meta.get("sha1hex") + if not sha1hex: + return None + if html_meta.get("status") != "success" or not html_meta.get("has_teixml"): + return None + + try: + tei_xml = self.sandcrawler_s3_client.get_blob( + bucket="sandcrawler", + prefix="", + folder="html_body", + sha1hex=sha1hex, + extension=".tei.xml", + ) + # print(grobid_xml) + except minio.error.NoSuchKey: + return None + + return dict( + html_meta=html_meta, + tei_xml=tei_xml, + release_ident=release_ident, + webcapture_ident=wc.ident, + ) + def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]: """ Checks in IssueDB to see if this release is likely to have a copy in a @@ -279,6 +314,7 @@ class WorkPipeline: grobid_fulltext: Optional[Any] = None pdf_meta: Optional[Any] = None pdftotext_fulltext: Optional[Any] = None + html_fulltext: Optional[Any] = None for ident in pref_idents: release = release_dict[ident] if not release.files: @@ -295,7 +331,14 @@ class WorkPipeline: pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident) if grobid_fulltext or pdftotext_fulltext: break - if grobid_fulltext or pdftotext_fulltext: + pdf_meta = None + for wc in release.webcaptures: + # find primary web capture object + html_fulltext = self.fetch_webcapture_html_fulltext(wc, ident) + if html_fulltext and html_fulltext.get("tei_xml"): + break + html_fulltext = None + if grobid_fulltext or pdftotext_fulltext or html_fulltext: break # find best accessible SIM metadata and fulltext @@ -335,6 +378,7 @@ class WorkPipeline: grobid_fulltext=grobid_fulltext, pdftotext_fulltext=pdftotext_fulltext, pdf_meta=pdf_meta, + html_fulltext=html_fulltext, sim_fulltext=sim_fulltext, ) |