diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html.py | 15 |
1 file changed, 15 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 88ea41b..0e64c45 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -338,4 +338,19 @@ def extract_fulltext_url(html_url, html_body): url = host_prefix + url return dict(pdf_url=url, technique='cnki-href') + # RWTH AACHEN repository + if '://publications.rwth-aachen.de/record/' in html_url: + record_id = html_url.split('/')[-1] + url = f"{html_url}/files/{record_id}.pdf" + if record_id.isdigit() and url.encode('utf-8') in html_body: + return dict(pdf_url=url, technique='rwth-aachen-url') + + ### below here we are doing guesses + + # generic guess: try current URL plus .pdf, if it exists in the HTML body + if not '.pdf' in html_url: + url = html_url + ".pdf" + if url.encode('utf-8') in html_body: + return dict(pdf_url=url, technique='guess-url-plus-pdf') + return dict() |