about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-08-16 20:17:30 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-08-16 20:17:30 -0700
commit: e1cde3c95e5176f232ecbc22a8619149078dc91f (patch)
tree: 2624b700015663272e5d9edd21d7bf180e3803b6
parent: 26d90505bda2d1dfcc25af6b8a0270faa11729e7 (diff)
download: sandcrawler-e1cde3c95e5176f232ecbc22a8619149078dc91f.tar.gz
sandcrawler-e1cde3c95e5176f232ecbc22a8619149078dc91f.zip
html ingest: detect some blog platforms, and allow lower wordcount threshold
-rw-r--r-- python/sandcrawler/html_ingest.py | 6
1 files changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 115e7b5..3e57a04 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -200,6 +200,10 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
return "ojs"
elif generator and "plone" in generator.lower():
return "plone"
+ elif generator and "wordpress" in generator.lower():
+ return "wordpress"
+ elif generator and "blogger" in generator.lower():
+ return "blogger"
elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
return "ojs"
else:
@@ -316,6 +320,8 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
if word_count is not None:
if word_count < 20:
return "stub"
+ elif word_count > 500 and platform in ['wordpress', 'blogger']:
+ return "article-fulltext"
elif word_count > 1200:
return "article-fulltext"