From 18473bd57f9255ba2cd7fe9a75881abf601df7b1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 14 Nov 2019 00:30:08 -0800 Subject: start of hrmars.com ingest support --- python/sandcrawler/html.py | 2 ++ python/sandcrawler/ingest.py | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 7e1e10d..2117eb0 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -91,4 +91,6 @@ def extract_fulltext_url(html_url, html_body): if iframe and '.pdf' in iframe['src']: return dict(pdf_url=iframe['src']) + # TODO: hrmars.com. anchor with .pdf href, and anchor text is "PDF" + return dict() diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 29bb78e..43aea1b 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -47,7 +47,7 @@ class IngestFileWorker(SandcrawlerWorker): if not cdx: # TODO: refactor this to make adding new domains/patterns easier # sciencedirect.com (Elsevier) requires browser crawling (SPNv2) - if ('sciencedirect.com' in url and '.pdf' in url) or ('osapublishing.org' in url) or ('pubs.acs.org/doi/' in url) or ('ieeexplore.ieee.org' in url and ('.pdf' in url or '/stamp/stamp.jsp' in url)): + if ('sciencedirect.com' in url and '.pdf' in url) or ('osapublishing.org' in url) or ('pubs.acs.org/doi/' in url) or ('ieeexplore.ieee.org' in url and ('.pdf' in url or '/stamp/stamp.jsp' in url)) or ('hrmars.com' in url): #print(url) cdx_list = self.spn_client.save_url_now_v2(url) for cdx_url in cdx_list: @@ -63,6 +63,9 @@ class IngestFileWorker(SandcrawlerWorker): if 'ieeexplore.ieee.org' in cdx_url and '.pdf' in cdx_url and 'arnumber=' in cdx_url: cdx = self.cdx_client.lookup_latest(cdx_url) break + if 'hrmars.com' in cdx_url and 'journals/papers' in cdx_url: + cdx = self.cdx_client.lookup_latest(cdx_url) + break if not cdx: # extraction didn't work as expected; fetch whatever SPN2 got cdx = self.cdx_client.lookup_latest(url, follow_redirects=True) @@ -126,7 +129,7 @@ class IngestFileWorker(SandcrawlerWorker): return response file_meta = gen_file_metadata(body) mimetype = cdx_dict['mimetype'] - if mimetype in ('warc/revisit', 'binary/octet-stream', 'application/octet-stream'): + if mimetype in ('warc/revisit', 'binary/octet-stream', 'application/octet-stream', 'application/x-download', 'application/force-download'): mimetype = file_meta['mimetype'] response['file_meta'] = file_meta if 'html' in mimetype: -- cgit v1.2.3