From 6431b2f6b4bd9bd4dea4b373b89eb3f89648cc4c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 8 Nov 2020 14:28:37 -0800 Subject: html: small ingest improvements --- python/sandcrawler/html_ingest.py | 4 ++++ python/sandcrawler/html_metadata.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index 42bd946..a8ba0d6 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -127,6 +127,10 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}") if cdx_row.url != resource['url']: print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr) + if not cdx_row.status_code: + # TODO: fall back to a full fetch? + print(f" WARN: skipping revisit record", file=sys.stderr) + continue full.append(WebResource( surt=cdx_row.surt, timestamp=cdx_row.datetime, diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 0d14166..cd49a05 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -195,6 +195,13 @@ XML_FULLTEXT_PATTERNS: List[dict] = [ "attr": "href", "technique": "SciElo XML link", }, + { + "in_doc_url": "/article/view/", + "in_fulltext_url": "viewXML", + "selector": "a[class='obj_galley_link']", + "attr": "href", + "technique": "OJS Gallery XML link", + }, ] HTML_FULLTEXT_PATTERNS: List[dict] = [ @@ -203,6 +210,13 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [ "attr": "content", "technique": "citation_fulltext_html_url", }, + { + "in_doc_url": "/article/view/", + "in_fulltext_url": "inline=1", + "selector": "iframe[name='htmlFrame']", + "attr": "src", + "technique": "OJS HTML iframe", + }, ] PDF_FULLTEXT_PATTERNS: List[dict] = [ @@ -393,6 +407,7 @@ def load_adblock_rules() -> braveblock.Adblocker: # badges, "share" buttons, etc "apis.google.com/js/plusone", + "www.google.com/recaptcha/", # PLOS images "/resource/img/icon.*.16.png^", -- cgit v1.2.3