From 966df43c77581770df4d83d37afe8ead41d51abb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 20 Jul 2022 20:41:32 -0700 Subject: ingest: more PDF fulltext tricks --- python/sandcrawler/html.py | 7 +++++++ python/sandcrawler/html_metadata.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 73c808c..207f067 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -323,6 +323,13 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: technique="google-drive", ) + # https://doi.org/10.24850/j-tyca-14-4-7 + # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150 + if "docs.google.com/viewer?url=" in html_url: + original_url = html_url.split("?url=")[1] + if original_url: + return dict(pdf_url=original_url, technique="docs.google.com viewer") + ### below here we are doing guesses # generic guess: try current URL plus .pdf, if it exists in the HTML body diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index e5a1640..3d9e8ca 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -685,6 +685,30 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "technique": "filclass.ru PDF link", "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism", }, + { + "in_doc_url": "cdnsciencepub.com", + "in_fulltext_url": "pdf", + "selector": "article .info-panel a.btn--pdf", + "attr": "href", + "technique": "cdnsciencepub.com PDF link", + "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011", + }, + { + "in_doc_url": "grrjournal.com", + "in_fulltext_url": "pdf", + "selector": ".ereaders-main-section a[download]", + "attr": "href", + "technique": "grrjournal.com PDF link", + "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films", + }, + { + "in_doc_url": "/article/view/", + "in_fulltext_url": "pdf", + "selector": "#articleFullText a.remote_pdf", + "attr": "href", + "technique": "OJS remote_pdf link", + "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240", + }, ] FULLTEXT_URL_PATTERNS_SKIP: List[str] = [ @@ -801,6 +825,11 @@ def html_extract_fulltext_url( # don't link to self, unless no other options self_doc_url = (val, pattern.get("technique", "unknown")) continue + + # quirks modes / hacks + if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"): + val = val[:-1] + return (val, pattern.get("technique", "unknown")) if self_doc_url: print(" WARN: returning fulltext URL pointing to self", file=sys.stderr) -- cgit v1.2.3