html ingest: detect domain homepage (no path) as special case

author: Bryan Newbold <bnewbold@archive.org> 2021-08-16 20:16:55 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-08-16 20:16:55 -0700
commit: 26d90505bda2d1dfcc25af6b8a0270faa11729e7 (patch)
tree: 9225b9206b34e6641be6438b6e16af99602309a7 /python
parent: 8ccf0e6e935f169d3d6d35da36c767ebf0a4637a (diff)
download: sandcrawler-26d90505bda2d1dfcc25af6b8a0270faa11729e7.tar.gz
sandcrawler-26d90505bda2d1dfcc25af6b8a0270faa11729e7.zip
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 91b9cd6..115e7b5 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -236,6 +236,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     - component
     - issue-fulltext
     - landingpage
+    - homepage-domain
     - blocked-paywall
     - blocked-login
     - blocked-captcha
@@ -249,6 +250,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     fulltext or a landing page, but could be one of the other categories.
     """
 
+    # assert that this is a real URL
+    assert url.count('/') >= 2
+
     # basic paywall and loginwall detection based on URL
     if url.endswith("/cookieAbsent"):
         return "blocked-cookie"
@@ -264,6 +268,10 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     if "showcaptcha.asp" in url:
         return "blocked-captcha"
 
+    # is this the top-level URL of the domain? aka, no path?
+    if url.count('/') <= 2 or (url.count('/') == 3) and url.endswith('/'):
+        return "homepage-domain"
+
     platform = html_guess_platform(url, doc, biblio)
 
     if biblio:
author	Bryan Newbold <bnewbold@archive.org>	2021-08-16 20:16:55 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-08-16 20:16:55 -0700
commit	26d90505bda2d1dfcc25af6b8a0270faa11729e7 (patch)
tree	9225b9206b34e6641be6438b6e16af99602309a7 /python
parent	8ccf0e6e935f169d3d6d35da36c767ebf0a4637a (diff)
download	sandcrawler-26d90505bda2d1dfcc25af6b8a0270faa11729e7.tar.gz sandcrawler-26d90505bda2d1dfcc25af6b8a0270faa11729e7.zip