diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-20 18:03:12 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-20 18:03:12 -0700 |
commit | 98b95dea4eafec78f16f6afbabfe65aa2489e78f (patch) | |
tree | 2aa4ff3337a4315dbdf5cbf84b086cc14235dc8c /python | |
parent | a72019e6e788be64420719c5045e40614098c106 (diff) | |
download | sandcrawler-98b95dea4eafec78f16f6afbabfe65aa2489e78f.tar.gz sandcrawler-98b95dea4eafec78f16f6afbabfe65aa2489e78f.zip |
ingest: more PDF fulltext URL patterns
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 42 |
1 file changed, 42 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 114d9a1..e5a1640 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -646,6 +646,45 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "attr": "href", "technique": "erciyesmedj.com publication system PDF download link", }, + { + "selector": "body embed[alt='pdf']", + "attr": "src", + "technique": "embed PDF", + "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913", + }, + { + "in_fulltext_url": "viewPDFInterstitial", + "in_doc_url": "/article/view/", + "selector": "frameset frame", + "attr": "src", + "technique": "PDF iframe (viewPDFInterstitial)", + "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873", + }, + { + # note this one has a special handler + "in_doc_url": "viewPDFInterstitial", + "in_fulltext_url": "://", + "selector": "head meta[http-equiv='refresh']", + "attr": "content", + "technique": "HTML meta refresh (viewPDFInterstitial)", + "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873", + }, + { + "in_doc_url": "dlib.si/details/", + "in_fulltext_url": "PDF", + "selector": "body #FilesBox a", + "attr": "href", + "technique": "dlib.si download links", + "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ", + }, + { + "in_doc_url": "filclass.ru", + "in_fulltext_url": "pdf", + "selector": "main .pdf-article a.pdficon", + "attr": "href", + "technique": "filclass.ru PDF link", + "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism", + }, ] FULLTEXT_URL_PATTERNS_SKIP: List[str] = [ @@ -731,6 +770,9 @@ def html_extract_fulltext_url( val = None if "attr" in pattern: val = elem.attrs.get(pattern["attr"]) + # handle HTML redirect + if val and pattern["attr"] == "content" and "URL=" in val: + val = val.split("URL=")[1] elif 
pattern.get("use_body"): val = elem.text() if "://" not in val: |