aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/html_metadata.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-30 15:17:14 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-30 15:17:14 -0700
commit 08bf16e6da9666bb81e4d1ecddff48fe7cf9205c (patch)
tree 41552977a735d13152a6fe01704b839633e121c1 /python/sandcrawler/html_metadata.py
parent 24bfdfaa260156e395c509f0c18657e79dc6f730 (diff)
download sandcrawler-08bf16e6da9666bb81e4d1ecddff48fe7cf9205c.tar.gz
sandcrawler-08bf16e6da9666bb81e4d1ecddff48fe7cf9205c.zip
html: more ingest improvements
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- python/sandcrawler/html_metadata.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 6b1bdef..d3ca1b7 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -14,6 +14,8 @@ import braveblock
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
# - inspection of actual publisher HTML
# - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata
+# - "HTML meta tags used by journal articles"
+# https://gist.github.com/hubgit/5985963
# order of these are mostly by preference/quality (best option first), though
# also/sometimes re-ordered for lookup efficiency (lookup stops after first
# match)