diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:12:23 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:12:23 -0700 |
commit | 485dd2cfd120c52bbc5cc7745e44176d1003b40d (patch) | |
tree | 966bf78a4bd3cc1f6c94efb8fc3054a8a441dab0 /python/sandcrawler/html_metadata.py | |
parent | 7087e7f65d8b81e29af44a43c1067bb2ec618c4e (diff) | |
download | sandcrawler-485dd2cfd120c52bbc5cc7745e44176d1003b40d.tar.gz sandcrawler-485dd2cfd120c52bbc5cc7745e44176d1003b40d.zip |
lint collection membership (last lint for now)
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 18 |
1 file changed, 9 insertions, 9 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index ab0fd61..e2e673f 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -656,10 +656,10 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, """ self_doc_url: Optional[Tuple[str, str]] = None for pattern in patterns: - if not 'selector' in pattern: + if 'selector' not in pattern: continue if 'in_doc_url' in pattern: - if not pattern['in_doc_url'] in doc_url: + if pattern['in_doc_url'] not in doc_url: continue elem = doc.css_first(pattern['selector']) if not elem: @@ -669,14 +669,14 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, val = elem.attrs.get(pattern['attr']) elif pattern.get('use_body'): val = elem.text() - if not '://' in val: + if '://' not in val: continue if not val: continue val = urllib.parse.urljoin(doc_url, val) assert val if 'in_fulltext_url' in pattern: - if not pattern['in_fulltext_url'] in val: + if pattern['in_fulltext_url'] not in val: continue for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP: if skip_pattern in val.lower(): @@ -714,7 +714,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat if val_list: for val in val_list: if 'content' in val.attrs and val.attrs['content']: - if not field in meta: + if field not in meta: meta[field] = [] meta[field].append(val.attrs['content']) break @@ -740,13 +740,13 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat raw_identifiers = meta.pop('raw_identifiers', []) for ident in raw_identifiers: if ident.startswith('doi:10.'): - if not 'doi' in meta: + if 'doi' not in meta: meta['doi'] = ident.replace('doi:', '') elif ident.startswith('10.') and '/' in ident: - if not 'doi' in meta: + if 'doi' not in meta: meta['doi'] = ident elif ident.startswith('isbn:'): - if not 'isbn' in meta: + if 'isbn' not in meta: meta['isbn'] = ident.replace('isbn:', '') raw_date = meta.pop('raw_date', None) @@ 
-813,7 +813,7 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], for node in doc.css(selector): for attr in attrs: - if not attr in node.attrs: + if attr not in node.attrs: continue url = node.attrs.get(attr) # special-case a couple meta URI prefixes which don't match with adblock rules |