move some PDF URL extraction into declarative format

author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:54:24 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:54:24 -0800
commit: a68aadc4107fc68dc2748c52dab8a4bd92cca022 (patch)
tree: da3da0a847d5c10dee873e8bce8198a39c12ce1f /python
parent: 6a701f966b8bc760bf904c0569562b0159e13559 (diff)
download: sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.tar.gz
sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.zip
5 files changed, 177 insertions, 143 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 70761a3..14561bf 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -11,24 +11,6 @@ IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"')
 OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
 SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")
 
-def test_regex():
-    lines = """
-    blah
-    var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
-    asdf"""
-    m = OVID_JOURNAL_URL_REGEX.search(lines)
-    assert m.group(1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
-
-    lines = """
-            window.onload = function () {
-                window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client';
-                refreshOriginalWindow();
-            }
-    """
-    url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
-    m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
-    assert m.group(1) == url
-
 
 def extract_fulltext_url(html_url, html_body):
     """
@@ -87,56 +69,8 @@ def extract_fulltext_url(html_url, html_body):
     if meta and meta.get('content'):
         meta_generator = meta['content'].strip()
 
-    # sage, and also utpjournals (see below)
-    # https://journals.sagepub.com/doi/10.1177/2309499019888836
-    # <a href="http://journals.sagepub.com/doi/pdf/10.1177/2309499019888836" class="show-pdf" target="_self">
-    # <a href="http://utpjournals.press/doi/pdf/10.3138/cjh.ach.54.1-2.05" class="show-pdf" target="_blank">
-    href = soup.find('a', attrs={"class":"show-pdf"})
-    if href:
-        url = href['href'].strip()
-        if url.startswith('http'):
-            return dict(pdf_url=url, technique='href_show-pdf')
-
-    # ACS (and probably others) like:
-    #   https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379
-    #   <a href="/doi/pdf/10.1021/acs.estlett.9b00379" title="PDF" target="_blank" class="button_primary"><i class="icon-file-pdf-o"></i><span>PDF (1 MB)</span></a>
-    href = soup.find('a', attrs={"title":"PDF"})
-    if href and href.get('href'):
-        url = href['href'].strip()
-        if url.startswith('http'):
-            return dict(pdf_url=url, technique='href_title')
-        elif url.startswith('/'):
-            return dict(pdf_url=host_prefix+url, technique='href_title')
-
-    # http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336
-    href = soup.find('a', attrs={"id":"pdfDownloadLink"})
-    if href and href.get('href'):
-        url = href['href'].strip()
-        if url.startswith('http'):
-            return dict(pdf_url=url, technique='href_pdfDownloadLink')
-        elif url.startswith('/'):
-            return dict(pdf_url=host_prefix+url, technique='href_pdfDownloadLink')
-
-    # http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401
-    # <embed src="/files/jass_makaleler/1359848334_33-Okt.%20Yasemin%20KARADEM%C4%B0R.pdf" type="application/pdf" />
-    embed = soup.find('embed', attrs={"type": "application/pdf"})
-    if embed and embed.get('src'):
-        url = embed['src'].strip()
-        if url.startswith('/'):
-            url = host_prefix+url
-        if url.startswith('http'):
-            return dict(pdf_url=url, technique='embed_type')
-
     ### Publisher/Platform Specific ###
 
-    # eLife (elifesciences.org)
-    if '://elifesciences.org/articles/' in html_url:
-        anchor = soup.find("a", attrs={"data-download-type": "pdf-article"})
-        if anchor:
-            url = anchor['href'].strip()
-            assert '.pdf' in url
-            return dict(pdf_url=url, technique='publisher')
-
     # research square (researchsquare.com)
     if 'researchsquare.com/article/' in html_url:
         # JSON in body with a field like:
@@ -188,26 +122,6 @@ def extract_fulltext_url(html_url, html_body):
         if iframe and '.pdf' in iframe['src']:
             return dict(pdf_url=iframe['src'], technique="iframe")
 
-    # utpjournals.press
-    # https://utpjournals.press/doi/10.3138/cjh.ach.54.1-2.05
-    if '://utpjournals.press/doi/10.' in html_url:
-        # <a href="http://utpjournals.press/doi/pdf/10.3138/cjh.ach.54.1-2.05" class="show-pdf" target="_blank">
-        href = soup.find('a', attrs={"class":"show-pdf"})
-        if href:
-            url = href['href'].strip()
-            if url.startswith('http'):
-                return dict(pdf_url=url, technique='publisher-href')
-
-    # https://www.jcancer.org/v10p4038.htm
-    # simple journal-specific href
-    if '://www.jcancer.org/' in html_url and html_url.endswith(".htm"):
-        # <a href='v10p4038.pdf' class='textbutton'>PDF</a>
-        href = soup.find('a', attrs={"class":"textbutton"})
-        if href:
-            url = href['href'].strip()
-            if url.endswith(".pdf") and not "http" in url:
-                return dict(pdf_url=host_prefix+"/"+url, technique='journal-href')
-
     # https://insights.ovid.com/crossref?an=00042307-202001000-00013
     # Ovid is some kind of landing page bounce portal tracking run-around.
     # Can extract actual journal URL from javascript blob in the HTML
@@ -238,16 +152,6 @@ def extract_fulltext_url(html_url, html_body):
             next_url = html_url.replace('/doi/pdf/', '/doi/pdfdirect/')
             return dict(next_url=next_url, technique='wiley-pdfdirect')
 
-    # taylor and frances
-    # https://www.tandfonline.com/doi/full/10.1080/19491247.2019.1682234
-    # <a href="/doi/pdf/10.1080/19491247.2019.1682234?needAccess=true" class="show-pdf" target="_blank">
-    if "://www.tandfonline.com/doi/full/10." in html_url:
-        href = soup.find('a', attrs={"class":"show-pdf"})
-        if href:
-            url = href['href'].strip()
-            if "/pdf/" in url:
-                return dict(pdf_url=host_prefix+url, technique='publisher-href')
-
     # arxiv abstract pages
     if "://arxiv.org/abs/" in html_url:
         url = html_url.replace("/abs/", "/pdf/")
@@ -313,15 +217,6 @@ def extract_fulltext_url(html_url, html_body):
             url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.')
             return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url')
 
-    # journals.tsu.ru (and maybe others)
-    # http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405
-    # <a class='file pdf' href='http://journals.tsu.ru/engine/download.php?id=150921&area=files'>Скачать электронную версию публикации</a>
-    href = soup.find('a', attrs={"class":"file pdf"})
-    if href:
-        url = href['href'].strip()
-        if url.startswith('http'):
-            return dict(pdf_url=url, technique='href_file_pdf-pdf')
-
     # cogentoa.com
     # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
     if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url:
@@ -342,17 +237,6 @@ def extract_fulltext_url(html_url, html_body):
             if url and url.startswith('http'):
                 return dict(pdf_url=url, technique='figshare-json')
 
-    # eurosurveillance
-    # https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230
-    if "://www.eurosurveillance.org/content/" in html_url:
-        # <a href="/deliver/fulltext/eurosurveillance/25/11/eurosurv-25-11-3.pdf?itemId=/content/10.2807/1560-7917.ES.2020.25.11.2000230&mimeType=pdf&containerItemId=content/eurosurveillance" class="pdf " title="Download" rel="http://instance.metastore.ingenta.com/content/10.2807/1560-7917.ES.2020.25.11.2000230" target="/content/10.2807/1560-7917.ES.2020.25.11.2000230-pdf" >
-        href = soup.find('a', attrs={"class":"pdf", "title": "Download"})
-        if href:
-            url = href['href'].strip()
-            if not url.startswith('http'):
-                url = host_prefix + url
-            return dict(pdf_url=url, technique='eurosurveillance-href')
-
     # CNKI COVID-19 landing pages
     # http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ
     if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url:
@@ -410,3 +294,21 @@ def extract_fulltext_url(html_url, html_body):
             return dict(pdf_url=url, technique='guess-url-plus-pdf')
 
     return dict()
+
+def test_regex():
+    lines = """
+    blah
+    var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
+    asdf"""
+    m = OVID_JOURNAL_URL_REGEX.search(lines)
+    assert m.group(1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
+
+    lines = """
+            window.onload = function () {
+                window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client';
+                refreshOriginalWindow();
+            }
+    """
+    url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
+    m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
+    assert m.group(1) == url
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index eb89a01..15f44f4 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,4 +1,5 @@
 
+import sys
 import datetime
 from typing import List, Optional, Any, Tuple, Dict
 import urllib.parse
@@ -212,6 +213,13 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
         "attr": "href",
         "technique": "OJS Gallery XML link",
     },
+    {
+        "in_fulltext_url": "/download/xml/",
+        "selector": "a[title='XML']",
+        "attr": "href",
+        "technique": "ARPHA XML link",
+        "example_page": "https://zookeys.pensoft.net/article/26391",
+    },
 ]
 
 HTML_FULLTEXT_PATTERNS: List[dict] = [
@@ -241,17 +249,137 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
+# This is a database of matching patterns. Most of these discovered by hand,
+# looking at OA journal content that failed to crawl/ingest.
 PDF_FULLTEXT_PATTERNS: List[dict] = [
     {
-        "selector": "meta[name='citation_pdf_url']",
+        "selector": "head meta[name='citation_pdf_url']",
+        "attr": "content",
+        "technique": "citation_pdf_url",
+    },
+    {
+        "selector": "head meta[name='bepress_citation_pdf_url']",
         "attr": "content",
         "technique": "citation_pdf_url",
     },
     {
-        "selector": "meta[name='bepress_citation_pdf_url']",
+        "in_doc_url": "journals.lww.com",
+        "selector": "head meta[name='wkhealth_pdf_url']",
+        "attr": "content",
+        "technique": "wkhealth_pdf_url",
+        "example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx",
+    },
+    {
+        "selector": "head meta[property='citation_pdf_url']",
         "attr": "content",
         "technique": "citation_pdf_url",
+        # eg, researchgate
+    },
+    {
+        "selector": "head meta[name='eprints.document_url']",
+        "attr": "content",
+        "technique": "citation_pdf_url (property)",
+    },
+    {
+        "in_doc_url": "/doi/10.",
+        "in_fulltext_url": "/doi/pdf/",
+        "selector": "a.show-pdf",
+        "attr": "href",
+        "technique": "SAGE/UTP show-pdflink",
+        "example_page": "https://journals.sagepub.com/doi/10.1177/2309499019888836",
+        # also http://utpjournals.press/doi/10.3138/cjh.ach.54.1-2.05
+    },
+    {
+        "in_doc_url": "/doi/10.",
+        "in_fulltext_url": "/doi/pdf/",
+        "selector": "a[title='PDF']",
+        "attr": "href",
+        "technique": "title=PDF link",
+        "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
+    },
+    {
+        "in_doc_url": "/article/view/",
+        "selector": "a#pdfDownloadLink",
+        "attr": "href",
+        "technique": "pdfDownloadLink link",
+        "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
+    },
+    {
+        "in_fulltext_url": "/pdf/",
+        "selector": "a.show-pdf",
+        "attr": "href",
+        "technique": "SAGE PDF link",
+        "example_page": "http://journals.sagepub.com/doi/pdf/10.1177/2309499019888836",
+    },
+    {
+        "in_doc_url": "://elifesciences.org/articles/",
+        "in_fulltext_url": "/download/",
+        "selector": "a[data-download-type='pdf-article']",
+        "attr": "href",
+        "technique": "eLife PDF link",
+        "example_page": "https://elifesciences.org/articles/59841",
+    },
+    {
+        "in_doc_url": "://www.jcancer.org/",
+        "in_fulltext_url": ".pdf",
+        "selector": ".divboxright a.text-button",
+        "attr": "href",
+        "technique": "jcancer PDF link",
+        "example_page": "https://www.jcancer.org/v10p4038.htm",
+    },
+    {
+        "in_doc_url": "://www.tandfonline.com/doi/full/10.",
+        "in_fulltext_url": "/pdf/",
+        "selector": "a.show-pdf",
+        "attr": "href",
+        "technique": "t+f show-pdf link",
+        "example_page": "https://www.tandfonline.com/doi/full/10.1080/19491247.2019.1682234",
+    },
+    {
+        "in_doc_url": "article_id=",
+        "in_fulltext_url": "download.php",
+        "selector": "a.file.pdf",
+        "attr": "href",
+        "technique": "pdf file link",
+        "example_page": "http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405",
+    },
+    {
+        "in_doc_url": "/content/10.",
+        "in_fulltext_url": "pdf",
+        "selector": "a.pdf[title='Download']",
+        "attr": "href",
+        "technique": "pdf file link",
+        "example_page": "https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230",
     },
+    {
+        "selector": "embed[type='application/pdf']",
+        "attr": "src",
+        "technique": "PDF embed",
+        "example_page": "http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401",
+    },
+    {
+        "in_doc_url": "/html/",
+        "in_fulltext_url": "create_pdf",
+        "selector": ".AbsPdfFigTab img[src='images/pdf-icon.jpg'] + a",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "http://www.aed.org.cn/nyzyyhjxb/html/2018/4/20180408.htm",
+    },
+    {
+        "in_doc_url": "/archive-detail/",
+        "in_fulltext_url": ".pdf",
+        "selector": ".contact-list a.download-pdf",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
+    },
+]
+
+FULLTEXT_URL_PATTERNS_SKIP = [
+    # wiley has a weird almost-blank page we don't want to loop on
+    "://onlinelibrary.wiley.com/doi/pdf/",
+    "://doi.org/",
+    "://dx.doi.org/",
 ]
 
 RELEASE_TYPE_MAP = {
@@ -310,6 +438,7 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
 
     Returns null or a tuple of (url, technique)
     """
+    self_doc_url: Optional[Tuple[str, str]] = None
     for pattern in patterns:
         if not 'selector' in pattern:
             continue
@@ -321,13 +450,24 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
             continue
         if 'attr' in pattern:
             val = elem.attrs[pattern['attr']]
-            if val:
-                val = urllib.parse.urljoin(doc_url, val)
-                assert val
-                if 'in_fulltext_url' in pattern:
-                    if not pattern['in_fulltext_url'] in val:
-                        continue
-                return (val, pattern.get('technique', 'unknown'))
+            if not val:
+                continue
+            val = urllib.parse.urljoin(doc_url, val)
+            assert val
+            if 'in_fulltext_url' in pattern:
+                if not pattern['in_fulltext_url'] in val:
+                    continue
+            val_lower = val.lower()  # skip substrings are compared against the lowercased URL
+            if any(skip_pattern in val_lower for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP):
+                continue  # known-bad target: move on to the next pattern instead of returning it
+            if url_fuzzy_equal(doc_url, val):
+                # don't link to self, unless no other options
+                self_doc_url = (val, pattern.get('technique', 'unknown'))
+                continue
+            return (val, pattern.get('technique', 'unknown'))
+    if self_doc_url:
+        print("  WARN: returning fulltext URL pointing to self", file=sys.stderr)
+        return self_doc_url
     return None
 
 def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 2f6be05..602f9c5 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -605,16 +605,14 @@ class IngestFileWorker(SandcrawlerWorker):
 
             if ingest_type == "pdf" and html_ish_resource:
 
-                fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
                 # the new style of URL extraction (already computed)
-                # we aren't quite ready to adopt this for the PDF path (which
-                # has more complex logic to avoid loops, etc)
-                #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url:
-                #    fulltext_url = dict(
-                #        pdf_url=html_biblio.pdf_fulltext_url,
-                #        technique="html_biblio",
-                #    )
+                if html_biblio and html_biblio.pdf_fulltext_url:
+                    fulltext_url = dict(
+                        pdf_url=html_biblio.pdf_fulltext_url,
+                        technique="html_biblio",
+                    )
+                else:
+                    fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
 
                 result['extract_next_hop'] = fulltext_url
                 if not fulltext_url:
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 7d58a39..9a81852 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -31,11 +31,3 @@ def test_extract_fulltext_url():
             f.read(),
         )
     assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
-
-    with open('tests/files/elife_article.html', 'rb') as f:
-        resp = extract_fulltext_url(
-            "https://elifesciences.org/articles/44753",
-            f.read(),
-        )
-    assert resp['pdf_url'] == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
-
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index b428b0d..bf26a98 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -39,6 +39,7 @@ def test_html_metadata_plos() -> None:
     assert meta.publisher == "Public Library of Science"
     assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
     assert meta.release_type == "article-journal"
+    assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
 
 
 def test_html_metadata_elife() -> None:
@@ -46,7 +47,7 @@ def test_html_metadata_elife() -> None:
     with open('tests/files/elife_article.html', 'r') as f:
         elife_html = f.read()
 
-    meta = html_extract_biblio("http://example.org", HTMLParser(elife_html))
+    meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html))
     assert meta is not None
     assert meta.title == "Parallel visual circuitry in a basal chordate"
     assert meta.doi == "10.7554/eLife.44753"
@@ -63,6 +64,7 @@ def test_html_metadata_elife() -> None:
     # 2019-04-18
     assert meta.release_date == datetime.date(year=2019, month=4, day=18)
     assert meta.publisher == "eLife Sciences Publications Limited"
+    assert meta.pdf_fulltext_url == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
 
 
 def test_html_metadata_peerj() -> None:
author	Bryan Newbold <bnewbold@archive.org>	2020-11-08 21:54:24 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-11-08 21:54:24 -0800
commit	a68aadc4107fc68dc2748c52dab8a4bd92cca022 (patch)
tree	da3da0a847d5c10dee873e8bce8198a39c12ce1f /python
parent	6a701f966b8bc760bf904c0569562b0159e13559 (diff)
download	sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.tar.gz sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.zip