about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/html_ingest.py 8
1 file changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 91b9cd6..115e7b5 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -236,6 +236,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
- component
- issue-fulltext
- landingpage
+ - homepage-domain
- blocked-paywall
- blocked-login
- blocked-captcha
@@ -249,6 +250,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
fulltext or a landing page, but could be one of the other categories.
"""
+ # sanity check: a URL must contain at least two '/' (as in "scheme://")
+ assert url.count('/') >= 2
+
# basic paywall and loginwall detection based on URL
if url.endswith("/cookieAbsent"):
return "blocked-cookie"
@@ -264,6 +268,10 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
if "showcaptcha.asp" in url:
return "blocked-captcha"
+ # is this the top-level URL of the domain, i.e. a URL with no path (or only "/")?
+ if url.count('/') <= 2 or (url.count('/') == 3) and url.endswith('/'):
+ return "homepage-domain"
+
platform = html_guess_platform(url, doc, biblio)
if biblio: