From a87ca1de1d8b31c4fbf9fddead27cdc58b09565a Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 3 Nov 2020 23:37:50 -0800
Subject: initial implementation of HTML ingest in existing worker

---
 python/sandcrawler/html_ingest.py   | 29 ++++++++++++++----
 python/sandcrawler/html_metadata.py |  5 ++++
 python/sandcrawler/ingest.py        | 55 +++++++++++++++++++++++++++++++++----
 python/sandcrawler/persist.py       | 20 ++++++++++++--
 4 files changed, 94 insertions(+), 15 deletions(-)

diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index fe883ba..11909e6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -76,21 +76,36 @@ class IngestWebResult(pydantic.BaseModel):
         datetime.datetime: lambda dt: dt.isoformat(),
     }
 
+class HtmlMetaRow(pydantic.BaseModel):
+    sha1hex: str
+    status: str
+    scope: Optional[str]
+    has_teixml: bool
+    has_thumbnail: bool
+    word_count: Optional[int]
+    biblio: Optional[dict]
+    resources: Optional[List[dict]]
+
+    class Config:
+        arbitrary_types_allowed = True
+        json_encoders = {
+            datetime.datetime: lambda dt: dt.isoformat(),
+        }
+
     def to_sql_tuple(self) -> Tuple:
         """
         This is for the html_meta SQL table.
         """
-        assert self.file_meta and "sha1hex" in self.file_meta
         return (
-            self.file_meta["sha1hex"],
+            self.sha1hex,
             datetime.datetime.now(), # updated
             self.status,
             self.scope,
-            bool(self.html_body and self.html_body['status'] == 'success' and self.html_body['tei_xml']),
-            False, # has_thumbnail
-            (self.html_body and self.html_body.get('word_count')) or None,
-            self.html_biblio,
-            self.html_resources,
+            self.has_teixml,
+            self.has_thumbnail,
+            self.word_count,
+            self.biblio and json.dumps(self.biblio, sort_keys=True),
+            self.resources and json.dumps(self.resources, sort_keys=True),
         )
 
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 3ebba57..8928978 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -248,6 +248,11 @@ class BiblioMetadata(pydantic.BaseModel):
     html_fulltext_url: Optional[str]
     xml_fulltext_url: Optional[str]
 
+    class Config:
+        json_encoders = {
+            datetime.date: lambda dt: dt.isoformat()
+        }
+
 
 def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
     """
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 2e227bf..cc64fa5 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -5,7 +5,7 @@ import gzip
 import time
 import base64
 import requests
-from typing import Optional, Tuple, Any, Dict
+from typing import Optional, Tuple, Any, Dict, List
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from collections import namedtuple
 from selectolax.parser import HTMLParser
@@ -13,9 +13,14 @@ from selectolax.parser import HTMLParser
 from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url
+from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
 from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_metadata import html_extract_fulltext_url, XML_FULLTEXT_PATTERNS
+from sandcrawler.html_ingest import fetch_html_resources, \
+    quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
+    WebResource
+from sandcrawler.html_metadata import html_extract_fulltext_url, \
+    XML_FULLTEXT_PATTERNS, BiblioMetadata, html_extract_resources, \
+    html_extract_biblio, load_adblock_rules
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.xml import xml_reserialize
@@ -75,6 +80,8 @@ class IngestFileWorker(SandcrawlerWorker):
         self.try_existing_pdfextract = kwargs.get('try_existing_pdfextract', True)
         self.try_wayback = kwargs.get('try_wayback', True)
         self.try_spn2 = kwargs.get('try_spn2', True)
+        self.html_quick_mode = False
+        self.adblock_rules = load_adblock_rules()
 
         self.base_url_blocklist = [
             # robot blocking
@@ -247,6 +254,8 @@ class IngestFileWorker(SandcrawlerWorker):
             return {
                 'xml_meta': self.process_xml(resource, file_meta),
             }
+        elif ingest_type == "html":
+            return self.process_html(resource, file_meta)
         else:
             raise NotImplementedError(f"process {ingest_type} hit")
 
@@ -326,6 +335,33 @@ class IngestFileWorker(SandcrawlerWorker):
             self.xmldoc_sink.push_record(msg, key=file_meta['sha1hex'])
         return dict(status="success")
 
+    def process_html(self, resource: ResourceResult, file_meta: dict) -> dict:
+
+        html_doc = HTMLParser(resource.body)
+        html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+        html_body = html_extract_body_teixml(resource.body)
+        html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('tei_xml'))
+
+        assert html_biblio
+
+        raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules)
+        assert len(raw_resources) <= 200
+
+        when = parse_cdx_datetime(resource.cdx.datetime)
+
+        full_resources: List[WebResource] = []
+        if self.html_quick_mode:
+            full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
+        else:
+            full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+
+        return dict(
+            html_body=html_body,
+            html_biblio=json.loads(html_biblio.json(exclude_none=True)),
+            scope=html_scope,
+            html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
+        )
+
     def timeout_response(self, task: dict) -> dict:
         print("[TIMEOUT]", file=sys.stderr)
         return dict(
@@ -336,7 +372,7 @@ class IngestFileWorker(SandcrawlerWorker):
         )
 
     def want(self, request: dict) -> bool:
-        if not request.get('ingest_type') in ('file', 'pdf', 'xml'):
+        if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html'):
             return False
         return True
 
@@ -347,7 +383,7 @@ class IngestFileWorker(SandcrawlerWorker):
             request['ingest_type'] = 'pdf'
 
         ingest_type = request.get('ingest_type')
-        if ingest_type not in ("pdf", "xml"):
+        if ingest_type not in ("pdf", "xml", "html"):
             raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
 
         # parse/clean URL
@@ -541,12 +577,21 @@ class IngestFileWorker(SandcrawlerWorker):
             if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"):
                 result['status'] = "wrong-mimetype"
                 return result
+        elif ingest_type == "html":
+            if file_meta['mimetype'] not in ("text/html",):
+                result['status'] = "wrong-mimetype"
+                return result
         else:
             raise NotImplementedError()
 
         info = self.process_hit(ingest_type, resource, file_meta)
         result.update(info)
 
+        # scope is getting calculated in process_hit()
+        if result.get('scope') and result['scope'] not in ('article-fulltext', 'unknown'):
+            result['status'] = "wrong-scope"
+            return result
+
         result['status'] = "success"
         result['hit'] = True
         if ingest_type == "pdf":
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index c225d5a..033bc91 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -28,6 +28,7 @@ from sandcrawler.db import SandcrawlerPostgresClient
 from sandcrawler.minio import SandcrawlerMinioClient
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.html_ingest import HtmlMetaRow
 
 
 class PersistCdxWorker(SandcrawlerWorker):
@@ -159,8 +160,21 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
             result['terminal_sha1hex'] = terminal.get('terminal_sha1hex')
         return result
 
-    def result_to_html_meta(self, record: dict) -> Optional[dict]:
-        raise NotImplementedError()
+    def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
+        html_body = record.get('html_body')
+        file_meta = record.get('file_meta')
+        if not (file_meta and html_body):
+            return None
+        return HtmlMetaRow(
+            sha1hex=file_meta["sha1hex"],
+            status=record.get('status'),
+            scope=record.get('scope'),
+            has_teixml=bool(html_body and html_body['status'] == 'success' and html_body.get('tei_xml')),
+            has_thumbnail=False, # TODO
+            word_count=(html_body and html_body.get('word_count')) or None,
+            biblio=record.get('html_biblio'),
+            resources=record.get('html_resources'),
+        )
 
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
@@ -200,7 +214,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
             self.counts['insert-file_meta'] += resp[0]
             self.counts['update-file_meta'] += resp[1]
 
-        html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_meta')]
+        html_meta_batch = [self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body')]
         if html_meta_batch:
             resp = self.db.insert_html_meta(self.cur, html_meta_batch, on_conflict="nothing")
             self.counts['insert-html_meta'] += resp[0]
--
cgit v1.2.3
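
Usage note: the pieces in this patch fit together as ingest result record ->
HtmlMetaRow -> SQL tuple for the html_meta table. Below is a minimal sketch of
that flow, assuming the patched sandcrawler.html_ingest module is importable;
all record values here are hypothetical examples, not data from this commit.

    # Minimal sketch (not part of the commit): build an HtmlMetaRow the same
    # way PersistIngestFileResultWorker.result_to_html_meta() does above.
    # The record dict and its values are made-up examples.
    from sandcrawler.html_ingest import HtmlMetaRow

    record = {
        "status": "success",
        "scope": "article-fulltext",
        "file_meta": {"sha1hex": "0123456789abcdef0123456789abcdef01234567"},
        "html_body": {"status": "success", "tei_xml": "<TEI/>", "word_count": 1234},
        "html_biblio": {"title": "Example Article"},
        "html_resources": [{"url": "https://example.com/figure1.png"}],
    }

    html_body = record["html_body"]
    row = HtmlMetaRow(
        sha1hex=record["file_meta"]["sha1hex"],
        status=record["status"],
        scope=record["scope"],
        has_teixml=bool(html_body["status"] == "success" and html_body.get("tei_xml")),
        has_thumbnail=False,  # thumbnail support is still a TODO in the patch
        word_count=html_body.get("word_count"),
        biblio=record["html_biblio"],
        resources=record["html_resources"],
    )

    # to_sql_tuple() JSON-encodes biblio/resources (sort_keys=True) and adds a
    # current "updated" timestamp, matching the html_meta column order.
    print(row.to_sql_tuple())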