From 9d81f6e3f8a4b300c18a831e80880a8e181f812f Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Sat, 8 Aug 2020 16:55:27 -0700
Subject: add more HTML extraction tricks

---
 python/sandcrawler/html.py | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 0e64c45..85d32c0 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -54,6 +54,8 @@ def extract_fulltext_url(html_url, html_body):
     meta = soup.find('meta', attrs={"name":"citation_pdf_url"})
     if not meta:
         meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
+    if not meta:
+        meta = soup.find('meta', attrs={"name":"wkhealth_pdf_url"})
     if not meta:
         # researchgate does this; maybe others also?
         meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
@@ -63,12 +65,19 @@ def extract_fulltext_url(html_url, html_body):
     # wiley has a weird almost-blank page we don't want to loop on
     if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
         url = meta['content'].strip()
-        if url.startswith('/'):
+        if '://doi.org/' in url:
+            print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
+        elif url.startswith('/'):
             return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
         elif url.startswith('http'):
             return dict(pdf_url=url, technique='citation_pdf_url')
         else:
-            print("malformed citation_pdf_url? {}".format(url), file=sys.stderr)
+            print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+
+    meta = soup.find('meta', attrs={"name":"generator"})
+    meta_generator = None
+    if meta and meta.get('content'):
+        meta_generator = meta['content'].strip()
 
     # sage, and also utpjournals (see below)
     # https://journals.sagepub.com/doi/10.1177/2309499019888836
@@ -345,6 +354,24 @@ def extract_fulltext_url(html_url, html_body):
         if record_id.isdigit() and url.encode('utf-8') in html_body:
             return dict(pdf_url=url, technique='rwth-aachen-url')
 
+    # physchemaspects.ru: use the first anchor whose text is exactly "download PDF file"
+    if '://physchemaspects.ru/' in html_url and soup:
+        for href in soup.find_all('a'):
+            if href.text == "download PDF file":
+                url = href['href']
+                if url.startswith('/'):
+                    url = host_prefix + url
+                return dict(pdf_url=url, technique='physchemaspects-href')
+
+    # OJS 3 (some): generator meta starts with "Open Journal Systems"; use the galley link if its text mentions "pdf"
+    if meta_generator and meta_generator.startswith("Open Journal Systems"):
+        href = soup.find('a', attrs={"class":"obj_galley_link file"})
+        if href and href.text and "pdf" in href.text.lower():
+            url = href['href'].strip()
+            if url.startswith('/'):
+                url = host_prefix + url
+            return dict(pdf_url=url, technique='ojs-galley-href')
+
     ### below here we are doing guesses
 
     # generic guess: try current URL plus .pdf, if it exists in the HTML body
-- 
cgit v1.2.3