add more HTML extraction tricks

author: Bryan Newbold <bnewbold@archive.org> 2020-08-08 16:55:27 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-08-08 17:06:02 -0700
commit: 9d81f6e3f8a4b300c18a831e80880a8e181f812f (patch)
tree: cc71902c988a36c00f8a149364d456c9be4bfeb2 /python/sandcrawler/html.py
parent: c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6 (diff)
download: sandcrawler-9d81f6e3f8a4b300c18a831e80880a8e181f812f.tar.gz sandcrawler-9d81f6e3f8a4b300c18a831e80880a8e181f812f.zip
1 files changed, 29 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 0e64c45..85d32c0 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -55,6 +55,8 @@ def extract_fulltext_url(html_url, html_body):
     if not meta:
         meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
     if not meta:
+        meta = soup.find('meta', attrs={"name":"wkhealth_pdf_url"})
+    if not meta:
         # researchgate does this; maybe others also?
         meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
     # if tag is only partially populated
@@ -63,12 +65,19 @@ def extract_fulltext_url(html_url, html_body):
     # wiley has a weird almost-blank page we don't want to loop on
     if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
         url = meta['content'].strip()
-        if url.startswith('/'):
+        if '://doi.org/' in url:
+            print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
+        elif url.startswith('/'):
             return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
         elif url.startswith('http'):
             return dict(pdf_url=url, technique='citation_pdf_url')
         else:
-            print("malformed citation_pdf_url? {}".format(url), file=sys.stderr)
+            print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+
+    meta = soup.find('meta', attrs={"name":"generator"})
+    meta_generator = None
+    if meta and meta.get('content'):
+        meta_generator = meta['content'].strip()
 
     # sage, and also utpjournals (see below)
     # https://journals.sagepub.com/doi/10.1177/2309499019888836
@@ -345,6 +354,24 @@ def extract_fulltext_url(html_url, html_body):
         if record_id.isdigit() and url.encode('utf-8') in html_body:
             return dict(pdf_url=url, technique='rwth-aachen-url')
 
+    # physchemaspects.ru
+    if '://physchemaspects.ru/' in html_url and soup:
+        for href in soup.find_all('a'):
+            if href.text == "download PDF file":
+                url = href['href']
+                if url.startswith('/'):
+                    url = host_prefix + url
+                return dict(pdf_url=url, technique='physchemaspects-href')
+
+    # OJS 3 (some)
+    if meta_generator and meta_generator.startswith("Open Journal Systems"):
+        href = soup.find('a', attrs={"class":"obj_galley_link file"})
+        if href and href.text and "pdf" in href.text.lower():
+            url = href['href'].strip()
+            if url.startswith('/'):
+                url = host_prefix + url
+            return dict(pdf_url=url, technique='ojs-galley-href')
+
     ### below here we are doing guesses
 
     # generic guess: try current URL plus .pdf, if it exists in the HTML body
author	Bryan Newbold <bnewbold@archive.org>	2020-08-08 16:55:27 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-08-08 17:06:02 -0700
commit	9d81f6e3f8a4b300c18a831e80880a8e181f812f (patch)
tree	cc71902c988a36c00f8a149364d456c9be4bfeb2 /python/sandcrawler/html.py
parent	c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6 (diff)
download	sandcrawler-9d81f6e3f8a4b300c18a831e80880a8e181f812f.tar.gz sandcrawler-9d81f6e3f8a4b300c18a831e80880a8e181f812f.zip