start of hrmars.com ingest support

author: Bryan Newbold <bnewbold@archive.org> 2019-11-14 00:30:08 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2019-11-14 00:30:08 -0800
commit: 18473bd57f9255ba2cd7fe9a75881abf601df7b1 (patch)
tree: 276b13c80e43e3fe32eac0c6465fa59732afed05
parent: 31672b40c358b6dfbf29520838c8064ed2891cee (diff)
download: sandcrawler-18473bd57f9255ba2cd7fe9a75881abf601df7b1.tar.gz sandcrawler-18473bd57f9255ba2cd7fe9a75881abf601df7b1.zip
2 files changed, 7 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 7e1e10d..2117eb0 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -91,4 +91,6 @@ def extract_fulltext_url(html_url, html_body):
         if iframe and '.pdf' in iframe['src']:
             return dict(pdf_url=iframe['src'])
 
+    # TODO: hrmars.com. anchor with .pdf href, and anchor text is "PDF"
+
     return dict()
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 29bb78e..43aea1b 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -47,7 +47,7 @@ class IngestFileWorker(SandcrawlerWorker):
         if not cdx:
             # TODO: refactor this to make adding new domains/patterns easier
             # sciencedirect.com (Elsevier) requires browser crawling (SPNv2)
-            if ('sciencedirect.com' in url and '.pdf' in url) or ('osapublishing.org' in url) or ('pubs.acs.org/doi/' in url) or ('ieeexplore.ieee.org' in url and ('.pdf' in url or '/stamp/stamp.jsp' in url)):
+            if ('sciencedirect.com' in url and '.pdf' in url) or ('osapublishing.org' in url) or ('pubs.acs.org/doi/' in url) or ('ieeexplore.ieee.org' in url and ('.pdf' in url or '/stamp/stamp.jsp' in url)) or ('hrmars.com' in url):
                 #print(url)
                 cdx_list = self.spn_client.save_url_now_v2(url)
                 for cdx_url in cdx_list:
@@ -63,6 +63,9 @@ class IngestFileWorker(SandcrawlerWorker):
                     if 'ieeexplore.ieee.org' in cdx_url and '.pdf' in cdx_url and 'arnumber=' in cdx_url:
                         cdx = self.cdx_client.lookup_latest(cdx_url)
                         break
+                    if 'hrmars.com' in cdx_url and 'journals/papers' in cdx_url:
+                        cdx = self.cdx_client.lookup_latest(cdx_url)
+                        break
                 if not cdx:
                     # extraction didn't work as expected; fetch whatever SPN2 got
                     cdx = self.cdx_client.lookup_latest(url, follow_redirects=True)
@@ -126,7 +129,7 @@ class IngestFileWorker(SandcrawlerWorker):
                 return response
             file_meta = gen_file_metadata(body)
             mimetype = cdx_dict['mimetype']
-            if mimetype in ('warc/revisit', 'binary/octet-stream', 'application/octet-stream'):
+            if mimetype in ('warc/revisit', 'binary/octet-stream', 'application/octet-stream', 'application/x-download', 'application/force-download'):
                 mimetype = file_meta['mimetype']
                 response['file_meta'] = file_meta
             if 'html' in mimetype:
author	Bryan Newbold <bnewbold@archive.org>	2019-11-14 00:30:08 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2019-11-14 00:30:08 -0800
commit	18473bd57f9255ba2cd7fe9a75881abf601df7b1 (patch)
tree	276b13c80e43e3fe32eac0c6465fa59732afed05
parent	31672b40c358b6dfbf29520838c8064ed2891cee (diff)
download	sandcrawler-18473bd57f9255ba2cd7fe9a75881abf601df7b1.tar.gz sandcrawler-18473bd57f9255ba2cd7fe9a75881abf601df7b1.zip