aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-08-08 16:55:27 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-08-08 17:06:02 -0700
commit 9d81f6e3f8a4b300c18a831e80880a8e181f812f (patch)
tree cc71902c988a36c00f8a149364d456c9be4bfeb2 /python
parent c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6 (diff)
download sandcrawler-9d81f6e3f8a4b300c18a831e80880a8e181f812f.tar.gz
sandcrawler-9d81f6e3f8a4b300c18a831e80880a8e181f812f.zip
add more HTML extraction tricks
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html.py 31
1 files changed, 29 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 0e64c45..85d32c0 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -55,6 +55,8 @@ def extract_fulltext_url(html_url, html_body):
if not meta:
meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
if not meta:
+ meta = soup.find('meta', attrs={"name":"wkhealth_pdf_url"})
+ if not meta:
# researchgate does this; maybe others also?
meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
# if tag is only partially populated
@@ -63,12 +65,19 @@ def extract_fulltext_url(html_url, html_body):
# wiley has a weird almost-blank page we don't want to loop on
if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
url = meta['content'].strip()
- if url.startswith('/'):
+ if '://doi.org/' in url:
+ print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
+ elif url.startswith('/'):
return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
elif url.startswith('http'):
return dict(pdf_url=url, technique='citation_pdf_url')
else:
- print("malformed citation_pdf_url? {}".format(url), file=sys.stderr)
+ print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+
+ meta = soup.find('meta', attrs={"name":"generator"})
+ meta_generator = None
+ if meta and meta.get('content'):
+ meta_generator = meta['content'].strip()
# sage, and also utpjournals (see below)
# https://journals.sagepub.com/doi/10.1177/2309499019888836
@@ -345,6 +354,24 @@ def extract_fulltext_url(html_url, html_body):
if record_id.isdigit() and url.encode('utf-8') in html_body:
return dict(pdf_url=url, technique='rwth-aachen-url')
+ # physchemaspects.ru
+ if '://physchemaspects.ru/' in html_url and soup:
+ for href in soup.find_all('a'):
+ if href.text == "download PDF file":
+ url = href['href']
+ if url.startswith('/'):
+ url = host_prefix + url
+ return dict(pdf_url=url, technique='physchemaspects-href')
+
+ # OJS 3 (some)
+ if meta_generator and meta_generator.startswith("Open Journal Systems"):
+ href = soup.find('a', attrs={"class":"obj_galley_link file"})
+ if href and href.text and "pdf" in href.text.lower():
+ url = href['href'].strip()
+ if url.startswith('/'):
+ url = host_prefix + url
+ return dict(pdf_url=url, technique='ojs-galley-href')
+
### below here we are doing guesses
# generic guess: try current URL plus .pdf, if it exists in the HTML body