about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-07-20 18:03:12 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-20 18:03:12 -0700
commit: 98b95dea4eafec78f16f6afbabfe65aa2489e78f (patch)
tree: 2aa4ff3337a4315dbdf5cbf84b086cc14235dc8c
parent: a72019e6e788be64420719c5045e40614098c106 (diff)
download: sandcrawler-98b95dea4eafec78f16f6afbabfe65aa2489e78f.tar.gz
sandcrawler-98b95dea4eafec78f16f6afbabfe65aa2489e78f.zip
ingest: more PDF fulltext URL patterns
-rw-r--r-- python/sandcrawler/html_metadata.py 42
1 files changed, 42 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 114d9a1..e5a1640 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -646,6 +646,45 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"attr": "href",
"technique": "erciyesmedj.com publication system PDF download link",
},
+ {
+ "selector": "body embed[alt='pdf']",
+ "attr": "src",
+ "technique": "embed PDF",
+ "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913",
+ },
+ {
+ "in_fulltext_url": "viewPDFInterstitial",
+ "in_doc_url": "/article/view/",
+ "selector": "frameset frame",
+ "attr": "src",
+ "technique": "PDF iframe (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ # note this one has a special handler
+ "in_doc_url": "viewPDFInterstitial",
+ "in_fulltext_url": "://",
+ "selector": "head meta[http-equiv='refresh']",
+ "attr": "content",
+ "technique": "HTML meta refresh (viewPDFInterstitial)",
+ "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+ },
+ {
+ "in_doc_url": "dlib.si/details/",
+ "in_fulltext_url": "PDF",
+ "selector": "body #FilesBox a",
+ "attr": "href",
+ "technique": "dlib.si download links",
+ "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ",
+ },
+ {
+ "in_doc_url": "filclass.ru",
+ "in_fulltext_url": "pdf",
+ "selector": "main .pdf-article a.pdficon",
+ "attr": "href",
+ "technique": "filclass.ru PDF link",
+ "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
@@ -731,6 +770,9 @@ def html_extract_fulltext_url(
val = None
if "attr" in pattern:
val = elem.attrs.get(pattern["attr"])
+ # handle HTML redirect
+ if val and pattern["attr"] == "content" and "URL=" in val:
+ val = val.split("URL=")[1]
elif pattern.get("use_body"):
val = elem.text()
if "://" not in val: