From 08bf16e6da9666bb81e4d1ecddff48fe7cf9205c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 30 Oct 2020 15:17:14 -0700 Subject: html: more ingest improvements --- python/sandcrawler/html_metadata.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'python/sandcrawler/html_metadata.py') diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 6b1bdef..d3ca1b7 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -14,6 +14,8 @@ import braveblock # - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing) # - inspection of actual publisher HTML # - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata +# - "HTML meta tags used by journal articles" +# https://gist.github.com/hubgit/5985963 # order of these are mostly by preference/quality (best option first), though # also/sometimes re-ordered for lookup efficiency (lookup stops after first # match) -- cgit v1.2.3