diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-12 10:18:21 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-12 10:18:21 -0800 |
commit | fb8497ebcb470a6da99c8d4cee5658b17672b86b (patch) | |
tree | 492226091bc00c322d937223da7030dc7a52fd4b | |
parent | 9592902ee082b9590d34db6b905bc57bdfeb3c00 (diff) | |
download | sandcrawler-fb8497ebcb470a6da99c8d4cee5658b17672b86b.tar.gz sandcrawler-fb8497ebcb470a6da99c8d4cee5658b17672b86b.zip |
html biblio: handle 'content not in attrs' case
-rw-r--r-- | python/sandcrawler/html_metadata.py | 4 |
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 367fce4..2082b65 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -451,7 +451,7 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
         if not elem:
             continue
         if 'attr' in pattern:
-            val = elem.attrs[pattern['attr']]
+            val = elem.attrs.get(pattern['attr'])
             if not val:
                 continue
             val = urllib.parse.urljoin(doc_url, val)
@@ -492,7 +492,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
             val_list = head.css(pattern)
             if val_list:
                 for val in val_list:
-                    if val.attrs['content']:
+                    if val.attrs.get('content'):
                         if not field in meta:
                             meta[field] = []
                         meta[field].append(val.attrs['content'])