html: remove old citation_pdf_url code path

This code path doesn't check for 'skip' patterns, resulting in a bunch of bad CDX checks and errors.
author: Bryan Newbold <bnewbold@archive.org> 2022-07-15 14:41:24 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-15 14:41:26 -0700
commit: fbc9b8e6edd437392f722112904c3bc1d32ff0e8 (patch)
tree: bf45131fd72ded232f46685d7da4e782c8078abe /python
parent: fe18ff4478de8481b732dce1408a39b1d3c2795d (diff)
download: sandcrawler-fbc9b8e6edd437392f722112904c3bc1d32ff0e8.tar.gz
sandcrawler-fbc9b8e6edd437392f722112904c3bc1d32ff0e8.zip
1 file changed, 1 insertion, 32 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index f73b579..73c808c 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -38,38 +38,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
     redirect: Any
 
     ### General Tricks ###
-
-    # highwire-style meta tag
-    meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
-    if not meta:
-        meta = soup.find("meta", attrs={"name": "bepress_citation_pdf_url"})
-    if not meta:
-        meta = soup.find("meta", attrs={"name": "wkhealth_pdf_url"})
-    if not meta:
-        # researchgate does this; maybe others also?
-        meta = soup.find("meta", attrs={"property": "citation_pdf_url"})
-    if not meta:
-        meta = soup.find("meta", attrs={"name": "eprints.document_url"})
-    # if tag is only partially populated
-    if meta and not meta.get("content"):
-        meta = None
-    # wiley has a weird almost-blank page we don't want to loop on
-    if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
-        url = meta["content"].strip()
-        if "://doi.org/" in url:
-            print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
-        elif url.startswith("/"):
-            if host_prefix + url == html_url:
-                print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
-            else:
-                return dict(pdf_url=host_prefix + url, technique="citation_pdf_url")
-        elif url.startswith("http"):
-            if url == html_url:
-                print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
-            else:
-                return dict(pdf_url=url, technique="citation_pdf_url")
-        else:
-            print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+    # note: most of these have migrated to the html_biblio code path
 
     meta = soup.find("meta", attrs={"name": "generator"})
     meta_generator = None
author	Bryan Newbold <bnewbold@archive.org>	2022-07-15 14:41:24 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2022-07-15 14:41:26 -0700
commit	fbc9b8e6edd437392f722112904c3bc1d32ff0e8 (patch)
tree	bf45131fd72ded232f46685d7da4e782c8078abe /python
parent	fe18ff4478de8481b732dce1408a39b1d3c2795d (diff)
download	sandcrawler-fbc9b8e6edd437392f722112904c3bc1d32ff0e8.tar.gz sandcrawler-fbc9b8e6edd437392f722112904c3bc1d32ff0e8.zip