diff options
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 9 |
1 files changed, 4 insertions, 5 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 93c7269..c6725dc 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,16 @@
-import sys
 import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
 import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
 
+import braveblock
 import dateparser
-from selectolax.parser import HTMLParser
 import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
 
 from sandcrawler.misc import url_fuzzy_equal
 
 
-
 # this is a map of metadata keys to CSS selectors
 # sources for this list include:
 # - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)