about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:28:24 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:28:24 -0800
commit: 5d525e9744303bf5ddcf673623483d4a6a787326 (patch)
tree: 14421cb165977aeeb80d652d582a65af7a44e304
parent: 5a9e8d9441662c508cf583114b9edc85cc608587 (diff)
download: sandcrawler-5d525e9744303bf5ddcf673623483d4a6a787326.tar.gz
download: sandcrawler-5d525e9744303bf5ddcf673623483d4a6a787326.zip
html: start improving scope detection
-rw-r--r-- python/sandcrawler/html_ingest.py 52
-rw-r--r-- python/sandcrawler/ingest.py 2
2 files changed, 49 insertions, 5 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 03ec6f4..42bd946 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -174,6 +174,28 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
return full
+def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]:
+    """
+    Tries to guess which publishing platform generated an HTML document.
+
+    Only Open Journal Systems (OJS) is detected so far. Checks, in order:
+
+    - the content of the "generator" <meta> tag, or (only when no such tag
+      exists) the text of the link with id "developedBy", mentioning
+      "open journal systems" (case-insensitive)
+    - a literal "powered by ... PKP OJS" link in the serialized HTML
+    - a <body> element with id "pkp-common-openJournalSystems"
+
+    `url` and `biblio` are accepted but not consulted by the current checks.
+
+    Returns "ojs3", "ojs", or None if no platform was recognized. The guess
+    and the generator string are also printed to stderr.
+    """
+    generator: Optional[str] = None
+    platform: Optional[str] = None
+
+    generator_elem = doc.css_first("meta[name='generator']")
+    if generator_elem:
+        # a generator tag lacking a 'content' attribute must not abort
+        # detection with a KeyError; treat it as "no generator string"
+        generator = generator_elem.attrs.get('content')
+    else:
+        generator_elem = doc.css_first("a[id='developedBy']")
+        if generator_elem:
+            generator = generator_elem.text()
+
+    # normalize once; a missing or empty generator matches nothing below
+    generator_lower = generator.lower() if generator else ""
+
+    # the more specific "3" match has to be tested before the generic one
+    if "open journal systems 3" in generator_lower:
+        platform = "ojs3"
+    elif "open journal systems" in generator_lower:
+        platform = "ojs"
+    # `doc.html or ""` guards against a None serialization (TypeError on `in`)
+    elif 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in (doc.html or ""):
+        platform = "ojs"
+    elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
+        platform = "ojs"
+    print(f"  HTML platform: {platform} generator: {generator}", file=sys.stderr)
+    return platform
+
+
def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str:
"""
This function tries to guess if an HTML document represents one of:
@@ -190,7 +212,11 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
- blockpage
- errorpage
- stub
+ - other
- unknown
+
+ Unknown implies the page could be anything. "other" implies it is not
+ fulltext or a landing page, but could be one of the other categories.
"""
# basic paywall and loginwall detection based on URL
@@ -205,17 +231,35 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
if "sci_arttext" in url:
return "article-fulltext"
- if biblio and biblio.html_fulltext_url == url:
- return "article-fulltext"
+ platform = html_guess_platform(url, doc, biblio)
+
+ if biblio:
+ if biblio.html_fulltext_url == url:
+ return "article-fulltext"
+ elif biblio.html_fulltext_url:
+ return "landingpage"
+
+ # OJS-specific detection
+ if platform in ("ojs", "ojs3"):
+
+ if biblio and biblio.title:
+ if word_count and word_count > 1200:
+ return "fulltext"
+ else:
+ return "landingpage"
+ else:
+ if "/article/view/" in url and word_count and word_count > 600:
+ return "fulltext"
+ return "other"
- # fallback: guess based word count (arbitrary guesses here)
+ # fallback: guess based on word count (arbitrary guesses here)
if word_count == None:
return "unknown"
#print(f" body text word count: {word_count}", file=sys.stderr)
assert word_count is not None
if word_count < 20:
return "stub"
- elif word_count > 800:
+ elif word_count > 1200:
return "article-fulltext"
return "unknown"
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5d31d62..d95b8bf 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -353,7 +353,7 @@ class IngestFileWorker(SandcrawlerWorker):
if html_scope not in ('article-fulltext', 'unknown'):
html_body.pop("tei_xml", None)
return dict(
- status="html-body-wrong-scope",
+ status="wrong-scope",
html_biblio=html_biblio_dict,
html_scope=html_scope,
)