From 18473bd57f9255ba2cd7fe9a75881abf601df7b1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 14 Nov 2019 00:30:08 -0800 Subject: start of hrmars.com ingest support --- python/sandcrawler/html.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'python/sandcrawler/html.py') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 7e1e10d..2117eb0 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -91,4 +91,6 @@ def extract_fulltext_url(html_url, html_body): if iframe and '.pdf' in iframe['src']: return dict(pdf_url=iframe['src']) + # TODO: hrmars.com. anchor with .pdf href, and anchor text is "PDF" + return dict() -- cgit v1.2.3