rwth-aachen.de HTML extract, and a generic URL guess method

author: Bryan Newbold <bnewbold@archive.org> 2020-08-08 16:00:36 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-08-08 16:55:08 -0700
commit: c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6 (patch)
tree: f3c593557fbce0e712b1e8d98f0ddcf663da9a4f
parent: 0aa723392c1c72a354731aa21c06c55adeacab30 (diff)
download: sandcrawler-c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6.tar.gz
sandcrawler-c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6.zip
1 file changed, 15 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 88ea41b..0e64c45 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -338,4 +338,19 @@ def extract_fulltext_url(html_url, html_body):
                 url = host_prefix + url
             return dict(pdf_url=url, technique='cnki-href')
 
+    # RWTH AACHEN repository
+    if '://publications.rwth-aachen.de/record/' in html_url:
+        record_id = html_url.split('/')[-1]
+        url = f"{html_url}/files/{record_id}.pdf"
+        if record_id.isdigit() and url.encode('utf-8') in html_body:
+            return dict(pdf_url=url, technique='rwth-aachen-url')
+
+    ### below here we are doing guesses
+
+    # generic guess: try current URL plus .pdf, if it exists in the HTML body
+    if not '.pdf' in html_url:
+        url = html_url + ".pdf"
+        if url.encode('utf-8') in html_body:
+            return dict(pdf_url=url, technique='guess-url-plus-pdf')
+
     return dict()
author	Bryan Newbold <bnewbold@archive.org>	2020-08-08 16:00:36 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-08-08 16:55:08 -0700
commit	c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6 (patch)
tree	f3c593557fbce0e712b1e8d98f0ddcf663da9a4f
parent	0aa723392c1c72a354731aa21c06c55adeacab30 (diff)
download	sandcrawler-c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6.tar.gz sandcrawler-c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6.zip