html parse: add another generic fulltext pattern

author: Bryan Newbold <bnewbold@archive.org> 2020-09-14 14:14:22 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-09-14 14:15:59 -0700
commit: 4bd6d443457bed67039c8d8a56a184c1f70247b6 (patch)
tree: 90958ee00beb856ac7aa59b5ba4d855cf281a6f5
parent: ee6129ea884036b666de7cff4ad7891675a52b3c (diff)
download: sandcrawler-4bd6d443457bed67039c8d8a56a184c1f70247b6.tar.gz
sandcrawler-4bd6d443457bed67039c8d8a56a184c1f70247b6.zip
1 file changed, 10 insertions, 1 deletion
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index acf1522..75e111e 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -95,13 +95,22 @@ def extract_fulltext_url(html_url, html_body):
     #   https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379
     #   <a href="/doi/pdf/10.1021/acs.estlett.9b00379" title="PDF" target="_blank" class="button_primary"><i class="icon-file-pdf-o"></i><span>PDF (1 MB)</span></a>
     href = soup.find('a', attrs={"title":"PDF"})
-    if href and 'href' in href:
+    if href and href.get('href'):
         url = href['href'].strip()
         if url.startswith('http'):
             return dict(pdf_url=url, technique='href_title')
         elif url.startswith('/'):
             return dict(pdf_url=host_prefix+url, technique='href_title')
 
+    # http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336
+    href = soup.find('a', attrs={"id":"pdfDownloadLink"})
+    if href and href.get('href'):
+        url = href['href'].strip()
+        if url.startswith('http'):
+            return dict(pdf_url=url, technique='href_pdfDownloadLink')
+        elif url.startswith('/'):
+            return dict(pdf_url=host_prefix+url, technique='href_pdfDownloadLink')
+
     # http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401
     # <embed src="/files/jass_makaleler/1359848334_33-Okt.%20Yasemin%20KARADEM%C4%B0R.pdf" type="application/pdf" />
     embed = soup.find('embed', attrs={"type": "application/pdf"})
author	Bryan Newbold <bnewbold@archive.org>	2020-09-14 14:14:22 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-09-14 14:15:59 -0700
commit	4bd6d443457bed67039c8d8a56a184c1f70247b6 (patch)
tree	90958ee00beb856ac7aa59b5ba4d855cf281a6f5
parent	ee6129ea884036b666de7cff4ad7891675a52b3c (diff)
download	sandcrawler-4bd6d443457bed67039c8d8a56a184c1f70247b6.tar.gz sandcrawler-4bd6d443457bed67039c8d8a56a184c1f70247b6.zip