From a90b604c189bc5655d4a050a9241dfe0b34dbc5b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 24 Oct 2022 18:35:02 -0700 Subject: ingest: more generic OJS support, including pre-prints There were some '/article/view/' patterns which can also be, e.g., '/preprint/view/'. --- python/sandcrawler/html_metadata.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 9773615..1e2d197 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -207,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "technique": "SciElo XML link", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "in_fulltext_url": "viewXML", "selector": "a[class='obj_galley_link']", "attr": "href", @@ -331,7 +331,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "selector": "a#pdfDownloadLink", "attr": "href", "technique": "OJS pdfDownloadLink link", @@ -612,8 +612,8 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1", }, { - "in_doc_url": "/article/view/", - "in_fulltext_url": "/article/download/", + "in_doc_url": "/view/", + "in_fulltext_url": "/download/", "selector": "body.pkp_page_article a.download", "attr": "href", "technique": "OJS PDF Embed", @@ -654,7 +654,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ }, { "in_fulltext_url": "viewPDFInterstitial", - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "selector": "frameset frame", "attr": "src", "technique": "PDF iframe (viewPDFInterstitial)", @@ -702,7 +702,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "example_page": 
"https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "in_fulltext_url": "pdf", "selector": "#articleFullText a.remote_pdf", "attr": "href", @@ -739,6 +739,22 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "technique": "generic download article button", "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo", }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/view/", + "selector": "body .entry_details a.pdf", + "attr": "href", + "technique": "generic OJS/preprints", + "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/download/", + "selector": "body header a.download", + "attr": "href", + "technique": "generic OJS/preprints PDF Embed", + "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327", + }, ] FULLTEXT_URL_PATTERNS_SKIP: List[str] = [ -- cgit v1.2.3