diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-08-16 20:16:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-08-16 20:16:55 -0700 |
commit | 26d90505bda2d1dfcc25af6b8a0270faa11729e7 (patch) | |
tree | 9225b9206b34e6641be6438b6e16af99602309a7 | |
parent | 8ccf0e6e935f169d3d6d35da36c767ebf0a4637a (diff) | |
download | sandcrawler-26d90505bda2d1dfcc25af6b8a0270faa11729e7.tar.gz sandcrawler-26d90505bda2d1dfcc25af6b8a0270faa11729e7.zip |
html ingest: detect domain homepage (no path) as special case
-rw-r--r-- | python/sandcrawler/html_ingest.py | 8 |
1 file changed, 8 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 91b9cd6..115e7b5 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -236,6 +236,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     - component
     - issue-fulltext
     - landingpage
+    - homepage-domain
     - blocked-paywall
     - blocked-login
     - blocked-captcha
@@ -249,6 +250,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     fulltext or a landing page, but could be one of the other categories.
     """
 
+    # assert that this is a real URL
+    assert url.count('/') >= 2
+
     # basic paywall and loginwall detection based on URL
     if url.endswith("/cookieAbsent"):
         return "blocked-cookie"
@@ -264,6 +268,10 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     if "showcaptcha.asp" in url:
         return "blocked-captcha"
 
+    # is this the top-level URL of the domain? aka, no path?
+    if url.count('/') <= 2 or (url.count('/') == 3) and url.endswith('/'):
+        return "homepage-domain"
+
     platform = html_guess_platform(url, doc, biblio)
 
     if biblio:
```