diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 21:55:31 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 21:55:31 -0800 |
commit | 95e92a112800bfe71d7b89e4cdf28075bde3542d (patch) | |
tree | cb38af654fd0bbd1b57bc719c239a8e830a63228 | |
parent | b6911f63a277007523e0dc265a339a80be80946e (diff) | |
download | sandcrawler-95e92a112800bfe71d7b89e4cdf28075bde3542d.tar.gz sandcrawler-95e92a112800bfe71d7b89e4cdf28075bde3542d.zip |
html: more small platform tweaks
-rw-r--r-- | python/sandcrawler/html_ingest.py | 9 |
1 file changed, 4 insertions, 5 deletions
```diff
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 958e81f..50b193c 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -206,6 +206,10 @@ def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetada
     try:
         if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
             return "ojs"
+        if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
+            return "arpha"
+        if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
+            return "galenos"
     except UnicodeDecodeError:
         pass
 
@@ -243,11 +247,6 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
 
     Unknown implies the page could be anything. "other" implies it is not
     fulltext or a landing page, but could be one of the other categories.
-
-    TODO: known javascript-heavy single-page-app:
-      - https://riojournal.com/article/35913/
-      - https://phmd.pl/resources/html/article/details?id=175497&language=en
-      - https://dez.pensoft.net/articles.php?id=11704
     """
 
     # basic paywall and loginwall detection based on URL
```