diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 15:17:14 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 15:17:14 -0700 |
commit | 08bf16e6da9666bb81e4d1ecddff48fe7cf9205c (patch) | |
tree | 41552977a735d13152a6fe01704b839633e121c1 /python/sandcrawler/html_metadata.py | |
parent | 24bfdfaa260156e395c509f0c18657e79dc6f730 (diff) | |
download | sandcrawler-08bf16e6da9666bb81e4d1ecddff48fe7cf9205c.tar.gz sandcrawler-08bf16e6da9666bb81e4d1ecddff48fe7cf9205c.zip |
html: more ingest improvements
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 2 |
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 6b1bdef..d3ca1b7 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -14,6 +14,8 @@ import braveblock
 # - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
 # - inspection of actual publisher HTML
 # - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata
+# - "HTML meta tags used by journal articles"
+# https://gist.github.com/hubgit/5985963
 # order of these are mostly by preference/quality (best option first), though
 # also/sometimes re-ordered for lookup efficiency (lookup stops after first
 # match)