about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-12 15:52:51 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-12 15:52:51 -0700
commit 0d5f6468960a7bbf7a499654feaaa5ecab7d9130 (patch)
tree 800195bfaf9cdfc0d37d5299231416ebda3cf29a
parent 8f85ab294eae50e31efa9e31bb0bca1bca76cf8b (diff)
download sandcrawler-0d5f6468960a7bbf7a499654feaaa5ecab7d9130.tar.gz
sandcrawler-0d5f6468960a7bbf7a499654feaaa5ecab7d9130.zip
ingest: doaj.org article landing page access links
-rw-r--r-- python/sandcrawler/html_metadata.py 12
-rw-r--r-- python/sandcrawler/ingest_file.py 1
2 files changed, 12 insertions, 1 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 2fb500c..edaf89f 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -255,6 +255,12 @@ HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"attr": "href",
"technique": "dovepress fulltext link",
},
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
]
COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
@@ -627,6 +633,12 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"attr": "href",
"technique": "PDF link (scitemed.com)",
},
+ {
+ "in_doc_url": "://doaj.org/article/",
+ "selector": "section.col-md-8 a[target='_blank'].button--primary",
+ "attr": "href",
+ "technique": "doaj.org access link",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index c79973f..25ae7d2 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -128,7 +128,6 @@ class IngestFileWorker(SandcrawlerWorker):
"://www.openlibrary.org/",
"://fatcat.wiki/",
"://orcid.org/",
- "://doaj.org/",
# Domain squats
"://bartandjones.com",
"://ijretm.com",