From 2ebef36c083b59d158fae7098da49bf972141f1c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Sep 2021 10:37:37 -0700 Subject: HTML ingest: several more PDF fulltext URL patterns --- python/sandcrawler/html_metadata.py | 87 +++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index abcc428..871be32 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -437,6 +437,93 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [ "technique": "PDF URL link", "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007", }, + { + "in_doc_url": "library.wur.nl/", + "in_fulltext_url": "pdf", + "selector": "a.wl_full_text_restricted", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922", + }, + { + "in_doc_url": "/dlibra/", + "in_fulltext_url": "pdf", + "selector": "iframe#js-main-frame", + "attr": "src", + "technique": "PDF iframe (dlibra)", + "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031", + }, + { + "in_doc_url": "/handle/", + "in_fulltext_url": "pdf", + "selector": "table.misc table.inner tr.b a", + "attr": "href", + "technique": "PDF URL link (DSpace, first file)", + "example_page": "https://orbi.uliege.be/handle/2268/174200", + }, + { + "in_doc_url": "/publications/", + "in_fulltext_url": "pdf", + "selector": ".publication-sidebar li.open-access a.document-link", + "attr": "href", + "technique": "PDF URL link (Pure repo, OA link)", + "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance", + }, + { + "in_doc_url": "//hal", + "selector": ".widget-openaccess .widget-content a", + "attr": "href", + "technique": "Fulltext OA URL (HAL)", + "example_page": "https://hal.archives-ouvertes.fr/hal-00744951", + }, + { + "in_doc_url": "/record/", + "in_fulltext_url": "pdf", + "selector": "#detailedrecordminipanelfile a", + "attr": "href", + "technique": "PDF URL link (Invenio)", + "example_page": "https://bib-pubdb1.desy.de/record/416556", + }, + { + "in_doc_url": "/available/", + "in_fulltext_url": "pdf", + "selector": "table.file-table a", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/", + }, + { + "in_doc_url": "/islandora/", + "in_fulltext_url": "pdf", + "selector": "a.islandora-pdf-link", + "attr": "href", + "technique": "PDF URL link (Islandora)", + "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804", + }, + { + "in_doc_url": "/receive/", + "in_fulltext_url": "pdf", + "selector": ".mir-preview noscript a", + "attr": "href", + "technique": "PDF iframe via noscript (MyCoRe)", + "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191", + }, + { + "in_doc_url": "/registro.do", + "in_fulltext_url": "imagenes", + "selector": ".resumen_bib a[data-analytics=media]", + "attr": "href", + "technique": "Media link (DIGIBIS)", + "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740", + }, + { + "in_doc_url": "/view", + "in_fulltext_url": "/at_download/", + "selector": ".documentContent #content a", + "attr": "href", + "technique": "Media link (Plone)", + "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view", + }, ] FULLTEXT_URL_PATTERNS_SKIP = [ -- cgit v1.2.3