ingest: eurosurveillance PDF parser

author: Bryan Newbold <bnewbold@archive.org> 2020-03-25 16:29:12 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-25 16:29:12 -0700
commit: b91987978330b6da1c6bb5650418c2c6068340a1 (patch)
tree: f95415a7355724988d9bfd37b0abe33c8636faa2
parent: 11f7c22e1de104918d32274feaea310c11476cc7 (diff)
download: sandcrawler-b91987978330b6da1c6bb5650418c2c6068340a1.tar.gz
sandcrawler-b91987978330b6da1c6bb5650418c2c6068340a1.zip
1 file changed, 11 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index c76d7a2..04b3afe 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -310,4 +310,15 @@ def extract_fulltext_url(html_url, html_body):
             if url and url.startswith('http'):
                 return dict(pdf_url=url, technique='figshare-json')
 
+    # eurosurveillance
+    # https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230
+    if "://www.eurosurveillance.org/content/" in html_url:
+        # <a href="/deliver/fulltext/eurosurveillance/25/11/eurosurv-25-11-3.pdf?itemId=/content/10.2807/1560-7917.ES.2020.25.11.2000230&mimeType=pdf&containerItemId=content/eurosurveillance" class="pdf " title="Download" rel="http://instance.metastore.ingenta.com/content/10.2807/1560-7917.ES.2020.25.11.2000230" target="/content/10.2807/1560-7917.ES.2020.25.11.2000230-pdf" >
+        href = soup.find('a', attrs={"class":"pdf", "title": "Download"})
+        if href:
+            url = href['href'].strip()
+            if not url.startswith('http'):
+                url = host_prefix + url
+            return dict(pdf_url=url, technique='eurosurveillance-href')
+
     return dict()
author	Bryan Newbold <bnewbold@archive.org>	2020-03-25 16:29:12 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-03-25 16:29:12 -0700
commit	b91987978330b6da1c6bb5650418c2c6068340a1 (patch)
tree	f95415a7355724988d9bfd37b0abe33c8636faa2
parent	11f7c22e1de104918d32274feaea310c11476cc7 (diff)
download	sandcrawler-b91987978330b6da1c6bb5650418c2c6068340a1.tar.gz sandcrawler-b91987978330b6da1c6bb5650418c2c6068340a1.zip