author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:12:23 -0700
---|---|---
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:12:23 -0700
commit | 485dd2cfd120c52bbc5cc7745e44176d1003b40d (patch) |
tree | 966bf78a4bd3cc1f6c94efb8fc3054a8a441dab0 /python |
parent | 7087e7f65d8b81e29af44a43c1067bb2ec618c4e (diff) |
lint collection membership (last lint for now)
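For context, this commit rewrites membership tests from `not X in Y` to the idiomatic `X not in Y` form (the pattern pycodestyle/flake8 reports as E713). The repository's exact lint configuration is not shown on this page, so the tool reference is an assumption. A minimal sketch of the before/after behavior:

```python
# Minimal sketch of the idiom change applied throughout the diff below.
# Both spellings are equivalent; `not in` is the preferred form
# (pycodestyle flags the other spelling as E713).
r = {"fatcat_release": "abc123", "metadata": {}}

old_style = not "grobid_version" in r   # works, but reads poorly
new_style = "grobid_version" not in r   # same result, idiomatic

assert old_style == new_style
```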
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/db.py | 2
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 4
-rw-r--r-- | python/sandcrawler/html.py | 12
-rw-r--r-- | python/sandcrawler/html_metadata.py | 18
-rw-r--r-- | python/sandcrawler/ia.py | 14
-rw-r--r-- | python/sandcrawler/misc.py | 4
-rw-r--r-- | python/sandcrawler/persist.py | 10
7 files changed, 32 insertions, 32 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 05fedc6..ee4d3bf 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -208,7 +208,7 @@ class SandcrawlerPostgresClient:
             # though (to save database space)
             dupe_fields = ('fatcat_release', 'grobid_version')
             for k in dupe_fields:
-                if not k in r:
+                if k not in r:
                     r[k] = r['metadata'].get(k)
                 r['metadata'].pop(k, None)
             r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 6d66d81..b6808b5 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -369,7 +369,7 @@ class FigshareHelper(FilesetPlatformHelper):
         platform_domain = components.netloc.split(':')[0].lower()
 
         # only work with full, versioned figshare.com URLs
-        if not 'figshare.com' in platform_domain:
+        if 'figshare.com' not in platform_domain:
             return False
 
         try:
@@ -537,7 +537,7 @@ class ZenodoHelper(FilesetPlatformHelper):
         platform_id = components.path.split('/')[2]
         assert platform_id.isdigit(), f"expected numeric: {platform_id}"
 
-        if not 'zenodo.org' in platform_domain:
+        if 'zenodo.org' not in platform_domain:
             raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
 
         # 2. API fetch
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index abd3d50..4d36573 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -48,7 +48,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
     if meta and not meta.get('content'):
         meta = None
     # wiley has a weird almost-blank page we don't want to loop on
-    if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
+    if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
         url = meta['content'].strip()
         if '://doi.org/' in url:
             print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
@@ -198,7 +198,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
 
     # american archivist (OA)
     # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
-    if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
+    if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
         # use a more aggressive direct guess to avoid rate-limiting...
         if "/doi/10." in html_url:
             url = html_url.replace("/doi/10.", "/doi/pdf/10.")
@@ -240,7 +240,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
 
     # www.ahajournals.org
     # https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
-    if "://www.ahajournals.org/doi/" in html_url and not '/doi/pdf/' in html_url:
+    if "://www.ahajournals.org/doi/" in html_url and '/doi/pdf/' not in html_url:
         # <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
         if b'/doi/pdf/10.' in html_body:
             url = html_url.replace('/doi/10.', '/doi/pdf/10.')
@@ -259,7 +259,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
 
     # cogentoa.com
     # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
-    if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url:
+    if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
         # blech, it's a SPA! All JS
         # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
         url = html_url + ".pdf"
@@ -321,14 +321,14 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
 
     # JMIR
     # https://mhealth.jmir.org/2020/7/e17891/
-    if '.jmir.org/' in html_url and not "/pdf" in html_url and html_url.endswith("/"):
+    if '.jmir.org/' in html_url and "/pdf" not in html_url and html_url.endswith("/"):
         url = html_url + "pdf"
         return dict(pdf_url=url, technique='jmir-url')
 
     ### below here we are doing guesses
 
     # generic guess: try current URL plus .pdf, if it exists in the HTML body
-    if not '.pdf' in html_url:
+    if '.pdf' not in html_url:
         url = html_url + ".pdf"
         if url.encode('utf-8') in html_body:
             return dict(pdf_url=url, technique='guess-url-plus-pdf')
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index ab0fd61..e2e673f 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -656,10 +656,10 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser,
     """
     self_doc_url: Optional[Tuple[str, str]] = None
     for pattern in patterns:
-        if not 'selector' in pattern:
+        if 'selector' not in pattern:
             continue
         if 'in_doc_url' in pattern:
-            if not pattern['in_doc_url'] in doc_url:
+            if pattern['in_doc_url'] not in doc_url:
                 continue
         elem = doc.css_first(pattern['selector'])
         if not elem:
@@ -669,14 +669,14 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser,
             val = elem.attrs.get(pattern['attr'])
         elif pattern.get('use_body'):
             val = elem.text()
-            if not '://' in val:
+            if '://' not in val:
                 continue
         if not val:
             continue
         val = urllib.parse.urljoin(doc_url, val)
         assert val
         if 'in_fulltext_url' in pattern:
-            if not pattern['in_fulltext_url'] in val:
+            if pattern['in_fulltext_url'] not in val:
                 continue
         for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
             if skip_pattern in val.lower():
@@ -714,7 +714,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
             if val_list:
                 for val in val_list:
                     if 'content' in val.attrs and val.attrs['content']:
-                        if not field in meta:
+                        if field not in meta:
                             meta[field] = []
                         meta[field].append(val.attrs['content'])
                 break
@@ -740,13 +740,13 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
     raw_identifiers = meta.pop('raw_identifiers', [])
     for ident in raw_identifiers:
         if ident.startswith('doi:10.'):
-            if not 'doi' in meta:
+            if 'doi' not in meta:
                 meta['doi'] = ident.replace('doi:', '')
         elif ident.startswith('10.') and '/' in ident:
-            if not 'doi' in meta:
+            if 'doi' not in meta:
                 meta['doi'] = ident
         elif ident.startswith('isbn:'):
-            if not 'isbn' in meta:
+            if 'isbn' not in meta:
                 meta['isbn'] = ident.replace('isbn:', '')
 
     raw_date = meta.pop('raw_date', None)
@@ -813,7 +813,7 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str],
 
     for node in doc.css(selector):
         for attr in attrs:
-            if not attr in node.attrs:
+            if attr not in node.attrs:
                 continue
             url = node.attrs.get(attr)
             # special-case a couple meta URI prefixes which don't match with adblock rules
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index aa4752e..b413bc8 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -399,7 +399,7 @@ class WaybackClient:
         """
         if not self.petabox_webdata_secret:
             raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
-        if not "/" in warc_path:
+        if "/" not in warc_path:
             raise ValueError(
                 "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
         warc_uri = self.warc_uri_prefix + warc_path
@@ -579,9 +579,9 @@ class WaybackClient:
         #print(resp.url, file=sys.stderr)
 
         # defensively check that this is actually correct replay based on headers
-        if not "X-Archive-Src" in resp.headers:
+        if "X-Archive-Src" not in resp.headers:
             raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
-        if not datetime in resp.url:
+        if datetime not in resp.url:
             raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(
                 datetime, resp.url))
 
@@ -634,9 +634,9 @@ class WaybackClient:
 
         # defensively check that this is actually correct replay based on headers
         # previously check for "X-Archive-Redirect-Reason" here
-        if not "X-Archive-Src" in resp.headers:
+        if "X-Archive-Src" not in resp.headers:
             raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
-        if not datetime in resp.url:
+        if datetime not in resp.url:
             raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(
                 datetime, resp.url))
 
@@ -772,7 +772,7 @@ class WaybackClient:
                     cdx=cdx_row,
                     revisit_cdx=None,
                 )
-            if not "://" in resource.location:
+            if "://" not in resource.location:
                 next_url = urllib.parse.urljoin(next_url, resource.location)
             else:
                 next_url = resource.location
@@ -1087,7 +1087,7 @@ class SavePageNowClient:
         #print(spn_result, file=sys.stderr)
 
         # detect partial URL response (aka, success, but missing full URL)
-        if not "://" in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
+        if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
             return ResourceResult(
                 start_url=start_url,
                 hit=False,
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 83a4626..1c779ce 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -58,7 +58,7 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     # crude checks for XHTML or JATS XML, using only first 1 kB of file
     if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
         mimetype = "application/xhtml+xml"
-    elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+    elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
         mimetype = "application/jats+xml"
     hashes = [
         hashlib.sha1(),
@@ -88,7 +88,7 @@ def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
     # crude checks for XHTML or JATS XML, using only first 1 kB of file
     if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
         mimetype = "application/xhtml+xml"
-    elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+    elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
         mimetype = "application/jats+xml"
     hashes = [
         hashlib.sha1(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index d47a8cb..8ec5979 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -86,7 +86,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
             raw['link_source_id'] = raw['fatcat']['release_ident']
 
         for k in ('ingest_type', 'base_url', 'link_source', 'link_source_id'):
-            if not k in raw:
+            if k not in raw:
                 self.counts['skip-request-fields'] += 1
                 return None
         if raw['ingest_type'] not in ('pdf', 'xml', 'html'):
@@ -120,10 +120,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         if there is a problem with conversion, return None and set skip count
         """
        for k in ('request', 'hit', 'status'):
-            if not k in raw:
+            if k not in raw:
                 self.counts['skip-result-fields'] += 1
                 return None
-        if not 'base_url' in raw['request']:
+        if 'base_url' not in raw['request']:
             self.counts['skip-result-fields'] += 1
             return None
         ingest_type = raw['request'].get('ingest_type')
@@ -181,9 +181,9 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         if there is a problem with conversion, return None and set skip count
         """
         for k in ('request', 'hit', 'status'):
-            if not k in raw:
+            if k not in raw:
                 return None
-        if not 'base_url' in raw['request']:
+        if 'base_url' not in raw['request']:
             return None
         ingest_type = raw['request'].get('ingest_type')
         if ingest_type not in ('dataset'):
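The misc.py hunks show the same `not in` idiom applied to bytes rather than dict keys or strings. Below is a standalone sketch of that mimetype sniff; the helper name `sniff_xml_mimetype` is hypothetical (in the repository this logic lives inline in `gen_file_metadata()` and `gen_file_metadata_path()`), but the conditions mirror the diff above:

```python
from typing import Optional

def sniff_xml_mimetype(blob: bytes) -> Optional[str]:
    """Hypothetical helper mirroring the crude XHTML/JATS sniff in misc.py.

    Only the first 1 kB of the file is inspected, as in the diff above.
    """
    head = blob[:1024]
    if b"<htm" in head and b'xmlns="http://www.w3.org/1999/xhtml"' in head:
        return "application/xhtml+xml"
    elif b"<article " in head and b"<html" not in head:
        return "application/jats+xml"
    return None

# bytes support `in` / `not in` membership tests just like str and dict keys
assert sniff_xml_mimetype(b'<article xmlns="http://jats.nlm.nih.gov">') == "application/jats+xml"
```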