aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/html.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-14 00:30:08 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-14 00:30:08 -0800
commit 18473bd57f9255ba2cd7fe9a75881abf601df7b1 (patch)
tree 276b13c80e43e3fe32eac0c6465fa59732afed05 /python/sandcrawler/html.py
parent 31672b40c358b6dfbf29520838c8064ed2891cee (diff)
download sandcrawler-18473bd57f9255ba2cd7fe9a75881abf601df7b1.tar.gz
sandcrawler-18473bd57f9255ba2cd7fe9a75881abf601df7b1.zip
start of hrmars.com ingest support
Diffstat (limited to 'python/sandcrawler/html.py')
-rw-r--r-- python/sandcrawler/html.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 7e1e10d..2117eb0 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -91,4 +91,6 @@ def extract_fulltext_url(html_url, html_body):
if iframe and '.pdf' in iframe['src']:
return dict(pdf_url=iframe['src'])
+ # TODO: hrmars.com. anchor with .pdf href, and anchor text is "PDF"
+
return dict()