diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-08-16 20:17:30 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-08-16 20:17:30 -0700 |
commit | e1cde3c95e5176f232ecbc22a8619149078dc91f (patch) | |
tree | 2624b700015663272e5d9edd21d7bf180e3803b6 | |
parent | 26d90505bda2d1dfcc25af6b8a0270faa11729e7 (diff) | |
download | sandcrawler-e1cde3c95e5176f232ecbc22a8619149078dc91f.tar.gz sandcrawler-e1cde3c95e5176f232ecbc22a8619149078dc91f.zip |
html ingest: detect some blog platforms, and allow lower wordcount threshold
-rw-r--r-- | python/sandcrawler/html_ingest.py | 6 |
1 file changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 115e7b5..3e57a04 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -200,6 +200,10 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
         return "ojs"
     elif generator and "plone" in generator.lower():
         return "plone"
+    elif generator and "wordpress" in generator.lower():
+        return "wordpress"
+    elif generator and "blogger" in generator.lower():
+        return "blogger"
     elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
         return "ojs"
     else:
@@ -316,6 +320,8 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     if word_count is not None:
         if word_count < 20:
             return "stub"
+        elif word_count > 500 and platform in ['wordpress', 'blogger']:
+            return "article-fulltext"
         elif word_count > 1200:
             return "article-fulltext"