aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:55:31 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:55:31 -0800
commit 95e92a112800bfe71d7b89e4cdf28075bde3542d (patch)
tree cb38af654fd0bbd1b57bc719c239a8e830a63228 /python
parent b6911f63a277007523e0dc265a339a80be80946e (diff)
download sandcrawler-95e92a112800bfe71d7b89e4cdf28075bde3542d.tar.gz
sandcrawler-95e92a112800bfe71d7b89e4cdf28075bde3542d.zip
html: most small platform tweaks
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html_ingest.py 9
1 files changed, 4 insertions, 5 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 958e81f..50b193c 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -206,6 +206,10 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
try:
if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
return "ojs"
+ if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
+ return "arpha"
+ if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
+ return "galenos"
except UnicodeDecodeError:
pass
@@ -243,11 +247,6 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
Unknown implies the page could be anything. "other" implies it is not
fulltext or a landing page, but could be one of the other categories.
-
- TODO: known javascript-heavy single-page-app:
- - https://riojournal.com/article/35913/
- - https://phmd.pl/resources/html/article/details?id=175497&language=en
- - https://dez.pensoft.net/articles.php?id=11704
"""
# basic paywall and loginwall detection based on URL