diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-10-24 18:35:02 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-10-24 18:35:04 -0700 |
commit | a90b604c189bc5655d4a050a9241dfe0b34dbc5b (patch) | |
tree | cc4552d4458d54116c505656cef8c6d1d824dfce | |
parent | d8f82f5836004d394a419574c50f0636369c94d7 (diff) | |
download | sandcrawler-a90b604c189bc5655d4a050a9241dfe0b34dbc5b.tar.gz sandcrawler-a90b604c189bc5655d4a050a9241dfe0b34dbc5b.zip |
ingest: more generic OJS support, including pre-prints
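Some patterns matched only '/article/view/' URLs, but the same kind of page can
also appear under other paths, e.g. '/preprint/view/'. Match on the more generic
'/view/' (and '/download/') instead.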
-rw-r--r-- | python/sandcrawler/html_metadata.py | 28 |
1 files changed, 22 insertions, 6 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 9773615..1e2d197 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -207,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "technique": "SciElo XML link", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "in_fulltext_url": "viewXML", "selector": "a[class='obj_galley_link']", "attr": "href", @@ -331,7 +331,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "selector": "a#pdfDownloadLink", "attr": "href", "technique": "OJS pdfDownloadLink link", @@ -612,8 +612,8 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1", }, { - "in_doc_url": "/article/view/", - "in_fulltext_url": "/article/download/", + "in_doc_url": "/view/", + "in_fulltext_url": "/download/", "selector": "body.pkp_page_article a.download", "attr": "href", "technique": "OJS PDF Embed", @@ -654,7 +654,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ }, { "in_fulltext_url": "viewPDFInterstitial", - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "selector": "frameset frame", "attr": "src", "technique": "PDF iframe (viewPDFInterstitial)", @@ -702,7 +702,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "in_fulltext_url": "pdf", "selector": "#articleFullText a.remote_pdf", "attr": "href", @@ -739,6 +739,22 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "technique": "generic download article button", "example_page": 
"https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo", }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/view/", + "selector": "body .entry_details a.pdf", + "attr": "href", + "technique": "generic OJS/preprints", + "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/download/", + "selector": "body header a.download", + "attr": "href", + "technique": "generic OJS/preprints PDF Embed", + "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327", + }, ] FULLTEXT_URL_PATTERNS_SKIP: List[str] = [ |