fill in more html extraction techniques

author: Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:53:42 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:53:42 -0800
commit: d76e287a3b40370bcdd020c0560b14769f8bd009 (patch)
tree: 7f8a7d53513148e4006108a416b59802296b87c0
parent: 24185837a47f305757a5c783b95ca25b709f66e3 (diff)
download: sandcrawler-d76e287a3b40370bcdd020c0560b14769f8bd009.tar.gz
sandcrawler-d76e287a3b40370bcdd020c0560b14769f8bd009.zip
1 file changed, 6 insertions, 7 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 780dcb2..25587e6 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -51,7 +51,7 @@ def extract_fulltext_url(html_url, html_body):
         if anchor:
             url = anchor['href'].strip()
             assert '.pdf' in url
-            return dict(pdf_url=url)
+            return dict(pdf_url=url, technique='publisher')
 
     # research square (researchsquare.com)
     if 'researchsquare.com/article/' in html_url:
@@ -61,17 +61,16 @@ def extract_fulltext_url(html_url, html_body):
         if m:
             url = m.group(1)
             assert len(url) < 1024
-            return dict(release_stage="manuscript", pdf_url=url)
+            return dict(release_stage="manuscript", pdf_url=url, technique='publisher')
 
-    # ehp.niehs.nih.gov
-    # <a href="/doi/pdf/10.1289/EHP3950">
+    # elsevier linking hub
     if '://linkinghub.elsevier.com/retrieve/pii/' in html_url:
         redirect = soup.find("input", attrs={"name": "redirectURL"})
         if redirect:
             url = redirect['value'].strip()
             if 'sciencedirect.com' in url:
                 url = urllib.parse.unquote(url)
-                return dict(next_url=url)
+                return dict(next_url=url, technique="publisher")
 
     # ieeexplore.ieee.org
     # https://ieeexplore.ieee.org/document/8730316
@@ -82,14 +81,14 @@ def extract_fulltext_url(html_url, html_body):
         if m:
             url = m.group(1)
             assert len(url) < 1024
-            return dict(release_stage="published", pdf_url=host_prefix+url)
+            return dict(release_stage="published", pdf_url=host_prefix+url, technique="publisher")
     # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313
     if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url:
         # HTML iframe like:
         # <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&amp;arnumber=8730313&amp;isnumber=8600701&amp;ref=" frameborder="0"></iframe>
         iframe = soup.find("iframe")
         if iframe and '.pdf' in iframe['src']:
-            return dict(pdf_url=iframe['src'])
+            return dict(pdf_url=iframe['src'], technique="iframe")
 
     # TODO: hrmars.com. anchor with .pdf href, and anchor text is "PDF"
author	Bryan Newbold <bnewbold@archive.org>	2020-01-09 17:53:42 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-01-09 17:53:42 -0800
commit	d76e287a3b40370bcdd020c0560b14769f8bd009 (patch)
tree	7f8a7d53513148e4006108a416b59802296b87c0
parent	24185837a47f305757a5c783b95ca25b709f66e3 (diff)
download	sandcrawler-d76e287a3b40370bcdd020c0560b14769f8bd009.tar.gz sandcrawler-d76e287a3b40370bcdd020c0560b14769f8bd009.zip