From 8958b12ff12c59f1c1f7267a509a99bfaa14c7d7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 6 Nov 2020 18:17:49 -0800 Subject: html: pdf and html extract similar to XML Note that the primary PDF URL extraction path is a separate code path. --- python/sandcrawler/html_metadata.py | 50 ++++++++++++++++++++++--------------- python/sandcrawler/ingest.py | 27 ++++++++++++++++++-- 2 files changed, 55 insertions(+), 22 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 8928978..0d14166 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -154,14 +154,6 @@ HEAD_META_PATTERNS: Any = { "meta[name='dc.language']", "meta[name='og:locale']", ], - "html_fulltext_url": [ - "meta[name='citation_fulltext_html_url']", - "meta[name='bepress_citation_fulltext_html_url']", - ], - "pdf_fulltext_url": [ - "meta[name='citation_pdf_url']", - "meta[name='bepress_citation_pdf_url']", - ], } HEAD_META_LIST_PATTERNS: Any = { @@ -205,6 +197,27 @@ XML_FULLTEXT_PATTERNS: List[dict] = [ }, ] +HTML_FULLTEXT_PATTERNS: List[dict] = [ + { + "selector": "meta[name='citation_fulltext_html_url']", + "attr": "content", + "technique": "citation_fulltext_html_url", + }, +] + +PDF_FULLTEXT_PATTERNS: List[dict] = [ + { + "selector": "meta[name='citation_pdf_url']", + "attr": "content", + "technique": "citation_pdf_url", + }, + { + "selector": "meta[name='bepress_citation_pdf_url']", + "attr": "content", + "technique": "citation_pdf_url", + }, +] + RELEASE_TYPE_MAP = { "research article": "article-journal", "text.serial.journal": "article-journal", @@ -308,9 +321,15 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat break # (some) fulltext extractions + pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS) + if pdf_fulltext_url: + meta['pdf_fulltext_url'] = pdf_fulltext_url[0] xml_fulltext_url = 
html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS) if xml_fulltext_url: meta['xml_fulltext_url'] = xml_fulltext_url[0] + html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS) + if html_fulltext_url: + meta['html_fulltext_url'] = html_fulltext_url[0] # TODO: replace with clean_doi() et al if meta.get('doi') and meta.get('doi').startswith('doi:'): @@ -340,24 +359,12 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat if release_type: meta['release_type'] = release_type - # resolve relative URLs - for key in ('pdf_fulltext_url', 'html_fulltext_url'): - if meta.get(key): - meta[key] = urllib.parse.urljoin(doc_url, meta[key]) - return BiblioMetadata(**meta) def load_adblock_rules() -> braveblock.Adblocker: """ TODO: consider blocking very generic assets: - - - favicon.ico - ://fonts.googleapis.com/css* - - ://widgets.figshare.com/* - - ://crossmark-cdn.crossref.org/widget/* - - ://code.jquery.com/* - => hrm - - ://platform.twitter.com/widgets.js - ://journals.plos.org/plosone/resource/img/icon.* """ return braveblock.Adblocker( @@ -384,6 +391,9 @@ def load_adblock_rules() -> braveblock.Adblocker: #"||ajax.googleapis.com^", #"||cdnjs.cloudflare.com^", + # badges, "share" buttons, etc + "apis.google.com/js/plusone", + # PLOS images "/resource/img/icon.*.16.png^", ], diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 363dfb8..f696231 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -19,8 +19,8 @@ from sandcrawler.html_ingest import fetch_html_resources, \ quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \ WebResource from sandcrawler.html_metadata import html_extract_fulltext_url, \ - XML_FULLTEXT_PATTERNS, BiblioMetadata, html_extract_resources, \ - html_extract_biblio, load_adblock_rules + XML_FULLTEXT_PATTERNS, HTML_FULLTEXT_PATTERNS, BiblioMetadata, \ + html_extract_resources, html_extract_biblio, load_adblock_rules 
from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.xml import xml_reserialize @@ -563,6 +563,29 @@ class IngestFileWorker(SandcrawlerWorker): next_url, ), file=sys.stderr) + if next_url in hops: + result['status'] = 'link-loop' + result['error_message'] = "repeated: {}".format(next_url) + return result + hops.append(next_url) + continue + elif ingest_type == "html" and html_ish_resource: + # parse with selectolax, extract HTML fulltext URL + html_doc = HTMLParser(resource.body) + extract_next_hop = html_extract_fulltext_url(resource.terminal_url, html_doc, HTML_FULLTEXT_PATTERNS) + if extract_next_hop: + next_url = extract_next_hop[0] + technique = extract_next_hop[1] + if next_url in hops: + # for HTML ingest, we don't count this as a link-loop + break + print("[PARSE {:>6}] {} {}".format( + ingest_type, + technique, + next_url, + ), + file=sys.stderr) + hops.append(next_url) continue # default is to NOT keep hopping -- cgit v1.2.3