summary refs log tree commit diff stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-06 19:51:00 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-18 11:58:26 -0800
commit 4979c58ee91903148962f4d62d1a8d423349ad67 (patch)
tree 09901de13601d058eb413614a0dc626e1e30f4d2 /fatcat_scholar
parent 7d38f46fc1970decfcfb1e3f4583b85605e5b8ee (diff)
download fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.tar.gz
fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.zip
add basic html fulltext support to fetch pipeline
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/sandcrawler.py 11
-rw-r--r-- fatcat_scholar/schema.py 1
-rw-r--r-- fatcat_scholar/work_pipeline.py 48
3 files changed, 58 insertions, 2 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 25c7002..416ed83 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -27,6 +27,17 @@ class SandcrawlerPostgrestClient:
else:
return None
+    def get_html_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
+        """
+        Queries the "/html_meta" endpoint for rows whose sha1hex equals
+        `sha1`. Returns the first row, or None if the response is empty.
+        """
+        query = dict(sha1hex="eq." + sha1)
+        resp = requests.get(self.api_url + "/html_meta", params=query)
+        resp.raise_for_status()
+        rows = resp.json()
+        return rows[0] if rows else None
+
class SandcrawlerMinioClient(object):
def __init__(
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 5637e0a..bec81ab 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -33,6 +33,7 @@ class IntermediateBundle(BaseModel):
grobid_fulltext: Optional[Dict[str, Any]]
pdftotext_fulltext: Optional[Dict[str, Any]]
pdf_meta: Optional[Dict[str, Any]]
+ html_fulltext: Optional[Dict[str, Any]]
sim_fulltext: Optional[Dict[str, Any]]
fetched: Optional[datetime.datetime]
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 631bda8..ea33a01 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -7,7 +7,7 @@ from typing import List, Dict, Tuple, Optional, Any, Sequence
import minio
import requests
import internetarchive
-from fatcat_openapi_client import ReleaseEntity, FileEntity
+from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
from fatcat_scholar.api_entities import *
from fatcat_scholar.config import settings
@@ -169,6 +169,41 @@ class WorkPipeline:
raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
)
+    def fetch_webcapture_html_fulltext(
+        self, wc: WebcaptureEntity, release_ident: str,
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Fetches extracted TEI-XML for the primary (original_url) HTML capture
+        of a webcapture. Returns None when the entity has no CDX rows, or
+        sandcrawler has no successful extraction with a stored TEI-XML blob.
+        """
+        primary_resources = [c for c in (wc.cdx or []) if c.url == wc.original_url]
+        if not primary_resources or primary_resources[0].mimetype != "text/html":
+            return None
+        html_meta = self.sandcrawler_db_client.get_html_meta(primary_resources[0].sha1)
+        if not html_meta or not html_meta.get("sha1hex"):
+            return None
+        if html_meta.get("status") != "success" or not html_meta.get("has_teixml"):
+            return None
+
+        try:
+            tei_xml = self.sandcrawler_s3_client.get_blob(
+                bucket="sandcrawler",
+                prefix="",
+                folder="html_body",
+                sha1hex=html_meta["sha1hex"],
+                extension=".tei.xml",
+            )
+        except minio.error.NoSuchKey:
+            return None
+
+        return dict(
+            html_meta=html_meta,
+            tei_xml=tei_xml,
+            release_ident=release_ident,
+            webcapture_ident=wc.ident,
+        )
+
def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
"""
Checks in IssueDB to see if this release is likely to have a copy in a
@@ -279,6 +314,7 @@ class WorkPipeline:
grobid_fulltext: Optional[Any] = None
pdf_meta: Optional[Any] = None
pdftotext_fulltext: Optional[Any] = None
+ html_fulltext: Optional[Any] = None
for ident in pref_idents:
release = release_dict[ident]
if not release.files:
@@ -295,7 +331,14 @@ class WorkPipeline:
pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)
if grobid_fulltext or pdftotext_fulltext:
break
- if grobid_fulltext or pdftotext_fulltext:
+ pdf_meta = None
+ for wc in release.webcaptures:
+ # find primary web capture object
+ html_fulltext = self.fetch_webcapture_html_fulltext(wc, ident)
+ if html_fulltext and html_fulltext.get("tei_xml"):
+ break
+ html_fulltext = None
+ if grobid_fulltext or pdftotext_fulltext or html_fulltext:
break
# find best accessible SIM metadata and fulltext
@@ -335,6 +378,7 @@ class WorkPipeline:
grobid_fulltext=grobid_fulltext,
pdftotext_fulltext=pdftotext_fulltext,
pdf_meta=pdf_meta,
+ html_fulltext=html_fulltext,
sim_fulltext=sim_fulltext,
)