skip citation_pdf_url if it is a link loop

This may help work around link-loop errors seen with a specific version of OJS (Open Journal Systems).
author: Bryan Newbold <bnewbold@archive.org> 2020-09-14 14:18:08 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-09-14 14:18:10 -0700
commit: c714ecdcd8aa8bb39b1b46860944b6cace7f5077 (patch)
tree: 5459c7476003d95fc451b06365745806e8766b54 /python
parent: 4bd6d443457bed67039c8d8a56a184c1f70247b6 (diff)
download: sandcrawler-c714ecdcd8aa8bb39b1b46860944b6cace7f5077.tar.gz
sandcrawler-c714ecdcd8aa8bb39b1b46860944b6cace7f5077.zip
1 file changed, 8 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 75e111e..a5cbaf5 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -70,9 +70,15 @@ def extract_fulltext_url(html_url, html_body):
         if '://doi.org/' in url:
             print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
         elif url.startswith('/'):
-            return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
+            if host_prefix+url == html_url:
+                print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
+            else:
+                return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
         elif url.startswith('http'):
-            return dict(pdf_url=url, technique='citation_pdf_url')
+            if url == html_url:
+                print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
+            else:
+                return dict(pdf_url=url, technique='citation_pdf_url')
         else:
             print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
author	Bryan Newbold <bnewbold@archive.org>	2020-09-14 14:18:08 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-09-14 14:18:10 -0700
commit	c714ecdcd8aa8bb39b1b46860944b6cace7f5077 (patch)
tree	5459c7476003d95fc451b06365745806e8766b54 /python
parent	4bd6d443457bed67039c8d8a56a184c1f70247b6 (diff)
download	sandcrawler-c714ecdcd8aa8bb39b1b46860944b6cace7f5077.tar.gz sandcrawler-c714ecdcd8aa8bb39b1b46860944b6cace7f5077.zip