html: start improving scope detection

author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:28:24 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:28:24 -0800
commit: 5d525e9744303bf5ddcf673623483d4a6a787326 (patch)
tree: 14421cb165977aeeb80d652d582a65af7a44e304 /python
parent: 5a9e8d9441662c508cf583114b9edc85cc608587 (diff)
download: sandcrawler-5d525e9744303bf5ddcf673623483d4a6a787326.tar.gz
sandcrawler-5d525e9744303bf5ddcf673623483d4a6a787326.zip
2 files changed, 49 insertions, 5 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 03ec6f4..42bd946 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -174,6 +174,28 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
     return full
 
 
+def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]:
+    generator: Optional[str] = None
+    platform: Optional[str] = None
+    generator_elem = doc.css_first("meta[name='generator']")
+    if generator_elem:
+        generator = generator_elem.attrs['content']
+    else:
+        generator_elem = doc.css_first("a[id='developedBy']")
+        if generator_elem:
+            generator = generator_elem.text()
+    if generator and "open journal systems 3" in generator.lower():
+        platform = "ojs3"
+    elif generator and "open journal systems" in generator.lower():
+        platform = "ojs"
+    elif 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
+        platform = "ojs"
+    elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
+        platform = "ojs"
+    print(f"  HTML platform: {platform} generator: {generator}", file=sys.stderr)
+    return platform
+
+
 def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
     """
     This function tries to guess if an HTML document represents one of:
@@ -190,7 +212,11 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
     - blockpage
     - errorpage
     - stub
+    - other
     - unknown
+
+    "unknown" implies the page could be anything. "other" implies it is not
+    fulltext or a landing page, but could be one of the other categories.
     """
 
     # basic paywall and loginwall detection based on URL
@@ -205,17 +231,35 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
         if "sci_arttext" in url:
             return "article-fulltext"
 
-    if biblio and biblio.html_fulltext_url == url:
-        return "article-fulltext"
+    platform = html_guess_platform(url, doc, biblio)
+
+    if biblio:
+        if biblio.html_fulltext_url == url:
+            return "article-fulltext"
+        elif biblio.html_fulltext_url:
+            return "landingpage"
+
+    # OJS-specific detection
+    if platform in ("ojs", "ojs3"):
+
+        if biblio and biblio.title:
+            if word_count and word_count > 1200:
+                return "fulltext"
+            else:
+                return "landingpage"
+        else:
+            if "/article/view/" in url and word_count and word_count > 600:
+                return "fulltext"
+        return "other"
 
-    # fallback: guess based word count (arbitrary guesses here)
+    # fallback: guess based on word count (arbitrary guesses here)
     if word_count == None:
         return "unknown"
     #print(f"  body text word count: {word_count}", file=sys.stderr)
     assert word_count is not None
     if word_count < 20:
         return "stub"
-    elif word_count > 800:
+    elif word_count > 1200:
         return "article-fulltext"
 
     return "unknown"
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5d31d62..d95b8bf 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -353,7 +353,7 @@ class IngestFileWorker(SandcrawlerWorker):
         if html_scope not in ('article-fulltext', 'unknown'):
             html_body.pop("tei_xml", None)
             return dict(
-                status="html-body-wrong-scope",
+                status="wrong-scope",
                 html_biblio=html_biblio_dict,
                 html_scope=html_scope,
             )
author	Bryan Newbold <bnewbold@archive.org>	2020-11-08 14:28:24 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-11-08 14:28:24 -0800
commit	5d525e9744303bf5ddcf673623483d4a6a787326 (patch)
tree	14421cb165977aeeb80d652d582a65af7a44e304 /python
parent	5a9e8d9441662c508cf583114b9edc85cc608587 (diff)
download	sandcrawler-5d525e9744303bf5ddcf673623483d4a6a787326.tar.gz sandcrawler-5d525e9744303bf5ddcf673623483d4a6a787326.zip