about summary refs log tree commit diff stats
path: root/python/sandcrawler/html.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/html.py')
-rw-r--r--  python/sandcrawler/html.py  51
1 file changed, 19 insertions(+), 32 deletions(-)
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 50183be..207f067 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -38,38 +38,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
redirect: Any
### General Tricks ###
-
- # highwire-style meta tag
- meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
- if not meta:
- meta = soup.find("meta", attrs={"name": "bepress_citation_pdf_url"})
- if not meta:
- meta = soup.find("meta", attrs={"name": "wkhealth_pdf_url"})
- if not meta:
- # researchgate does this; maybe others also?
- meta = soup.find("meta", attrs={"property": "citation_pdf_url"})
- if not meta:
- meta = soup.find("meta", attrs={"name": "eprints.document_url"})
- # if tag is only partially populated
- if meta and not meta.get("content"):
- meta = None
- # wiley has a weird almost-blank page we don't want to loop on
- if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
- url = meta["content"].strip()
- if "://doi.org/" in url:
- print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
- elif url.startswith("/"):
- if host_prefix + url == html_url:
- print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
- else:
- return dict(pdf_url=host_prefix + url, technique="citation_pdf_url")
- elif url.startswith("http"):
- if url == html_url:
- print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
- else:
- return dict(pdf_url=url, technique="citation_pdf_url")
- else:
- print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+ # note: most of these have migrated to the html_biblio code path
meta = soup.find("meta", attrs={"name": "generator"})
meta_generator = None
@@ -343,6 +312,24 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
url = html_url + "pdf"
return dict(pdf_url=url, technique="jmir-url")
+ # Google Drive
+ # this is assuming it is a PDF
+ if "drive.google.com/file/d/" in html_url and "/view" in html_url:
+ gdrive_id = html_url.split("/")[5]
+ if len(gdrive_id) > 10:
+ # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
+ return dict(
+ pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+ technique="google-drive",
+ )
+
+ # https://doi.org/10.24850/j-tyca-14-4-7
+ # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150
+ if "docs.google.com/viewer?url=" in html_url:
+ original_url = html_url.split("?url=")[1]
+ if original_url:
+ return dict(pdf_url=original_url, technique="docs.google.com viewer")
+
### below here we are doing guesses
# generic guess: try current URL plus .pdf, if it exists in the HTML body