From c714ecdcd8aa8bb39b1b46860944b6cace7f5077 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Sep 2020 14:18:08 -0700 Subject: skip citation_pdf_url if it is a link loop This may help get around link-loop errors for a specific version of OJS --- python/sandcrawler/html.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 75e111e..a5cbaf5 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -70,9 +70,15 @@ def extract_fulltext_url(html_url, html_body): if '://doi.org/' in url: print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr) elif url.startswith('/'): - return dict(pdf_url=host_prefix+url, technique='citation_pdf_url') + if host_prefix+url == html_url: + print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr) + else: + return dict(pdf_url=host_prefix+url, technique='citation_pdf_url') elif url.startswith('http'): - return dict(pdf_url=url, technique='citation_pdf_url') + if url == html_url: + print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr) + else: + return dict(pdf_url=url, technique='citation_pdf_url') else: print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr) -- cgit v1.2.3