diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:28:24 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:28:24 -0800 |
commit | 5d525e9744303bf5ddcf673623483d4a6a787326 (patch) | |
tree | 14421cb165977aeeb80d652d582a65af7a44e304 | |
parent | 5a9e8d9441662c508cf583114b9edc85cc608587 (diff) | |
download | sandcrawler-5d525e9744303bf5ddcf673623483d4a6a787326.tar.gz sandcrawler-5d525e9744303bf5ddcf673623483d4a6a787326.zip |
html: start improving scope detection
-rw-r--r-- | python/sandcrawler/html_ingest.py | 52 | ||||
-rw-r--r-- | python/sandcrawler/ingest.py | 2 |
2 files changed, 49 insertions, 5 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 03ec6f4..42bd946 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -174,6 +174,28 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
     return full
 
 
+def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]:
+    generator: Optional[str] = None
+    platform: Optional[str] = None
+    generator_elem = doc.css_first("meta[name='generator']")
+    if generator_elem:
+        generator = generator_elem.attrs['content']
+    else:
+        generator_elem = doc.css_first("a[id='developedBy']")
+        if generator_elem:
+            generator = generator_elem.text()
+    if generator and "open journal systems 3" in generator.lower():
+        platform = "ojs3"
+    elif generator and "open journal systems" in generator.lower():
+        platform = "ojs"
+    elif 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
+        platform = "ojs"
+    elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
+        platform = "ojs"
+    print(f" HTML platform: {platform} generator: {generator}", file=sys.stderr)
+    return platform
+
+
 def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
     """
     This function tries to guess if an HTML document represents one of:
@@ -190,7 +212,11 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     - blockpage
     - errorpage
     - stub
+    - other
     - unknown
+
+    Unknown implies the page could be anything. "other" implies it is not
+    fulltext or a landing page, but could be one of the other categories.
     """
 
     # basic paywall and loginwall detection based on URL
@@ -205,17 +231,35 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     if "sci_arttext" in url:
         return "article-fulltext"
 
-    if biblio and biblio.html_fulltext_url == url:
-        return "article-fulltext"
+    platform = html_guess_platform(url, doc, biblio)
+
+    if biblio:
+        if biblio.html_fulltext_url == url:
+            return "article-fulltext"
+        elif biblio.html_fulltext_url:
+            return "landingpage"
+
+    # OJS-specific detection
+    if platform in ("ojs", "ojs3"):
+
+        if biblio and biblio.title:
+            if word_count and word_count > 1200:
+                return "fulltext"
+            else:
+                return "landingpage"
+        else:
+            if "/article/view/" in url and word_count and word_count > 600:
+                return "fulltext"
+            return "other"
 
-    # fallback: guess based word count (arbitrary guesses here)
+    # fallback: guess based on word count (arbitrary guesses here)
     if word_count == None:
         return "unknown"
     #print(f" body text word count: {word_count}", file=sys.stderr)
     assert word_count is not None
     if word_count < 20:
         return "stub"
-    elif word_count > 800:
+    elif word_count > 1200:
         return "article-fulltext"
 
     return "unknown"
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5d31d62..d95b8bf 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -353,7 +353,7 @@ class IngestFileWorker(SandcrawlerWorker):
         if html_scope not in ('article-fulltext', 'unknown'):
             html_body.pop("tei_xml", None)
             return dict(
-                status="html-body-wrong-scope",
+                status="wrong-scope",
                 html_biblio=html_biblio_dict,
                 html_scope=html_scope,
             )