author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:12:23 -0700
---|---|---
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 18:12:23 -0700
commit | 485dd2cfd120c52bbc5cc7745e44176d1003b40d (patch) |
tree | 966bf78a4bd3cc1f6c94efb8fc3054a8a441dab0 /python |
parent | 7087e7f65d8b81e29af44a43c1067bb2ec618c4e (diff) |
lint collection membership (last lint for now)
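For context, this commit rewrites membership tests from `not X in Y` to the idiomatic `X not in Y` form (the pattern pycodestyle/flake8 reports as E713). The repository's exact lint configuration is not shown on this page, so the tool reference is an assumption. A minimal sketch of the before/after behavior:

```python
# Minimal sketch of the idiom change applied throughout the diff below.
# Both spellings are equivalent; `not in` is the preferred form
# (pycodestyle flags the other spelling as E713).
r = {"fatcat_release": "abc123", "metadata": {}}

old_style = not "grobid_version" in r   # works, but reads poorly
new_style = "grobid_version" not in r   # same result, idiomatic

assert old_style == new_style
```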
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/db.py | 2
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 4
-rw-r--r-- | python/sandcrawler/html.py | 12
-rw-r--r-- | python/sandcrawler/html_metadata.py | 18
-rw-r--r-- | python/sandcrawler/ia.py | 14
-rw-r--r-- | python/sandcrawler/misc.py | 4
-rw-r--r-- | python/sandcrawler/persist.py | 10
7 files changed, 32 insertions, 32 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 05fedc6..ee4d3bf 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -208,7 +208,7 @@ class SandcrawlerPostgresClient:
             # though (to save database space)
             dupe_fields = ('fatcat_release', 'grobid_version')
             for k in dupe_fields:
-                if not k in r:
+                if k not in r:
                     r[k] = r['metadata'].get(k)
                 r['metadata'].pop(k, None)
             r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 6d66d81..b6808b5 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -369,7 +369,7 @@ class FigshareHelper(FilesetPlatformHelper):
         platform_domain = components.netloc.split(':')[0].lower()
 
         # only work with full, versioned figshare.com URLs
-        if not 'figshare.com' in platform_domain:
+        if 'figshare.com' not in platform_domain:
             return False
 
         try:
@@ -537,7 +537,7 @@ class ZenodoHelper(FilesetPlatformHelper):
         platform_id = components.path.split('/')[2]
         assert platform_id.isdigit(), f"expected numeric: {platform_id}"
 
-        if not 'zenodo.org' in platform_domain:
+        if 'zenodo.org' not in platform_domain:
             raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
 
         # 2. API fetch
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index abd3d50..4d36573 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -48,7 +48,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
     if meta and not meta.get('content'):
         meta = None
     # wiley has a weird almost-blank page we don't want to loop on
-    if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
+    if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
         url = meta['content'].strip()
         if '://doi.org/' in url:
             print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
@@ -198,7 +198,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
 
     # american archivist (OA)
     # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
-    if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
+    if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
         # use a more aggressive direct guess to avoid rate-limiting...
         if "/doi/10." in html_url:
             url = html_url.replace("/doi/10.", "/doi/pdf/10.")
@@ -240,7 +240,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
 
     # www.ahajournals.org
     # https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
-    if "://www.ahajournals.org/doi/" in html_url and not '/doi/pdf/' in html_url:
+    if "://www.ahajournals.org/doi/" in html_url and '/doi/pdf/' not in html_url:
         # <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
         if b'/doi/pdf/10.' in html_body:
             url = html_url.replace('/doi/10.', '/doi/pdf/10.')
@@ -259,7 +259,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
 
     # cogentoa.com
     # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
-    if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url:
+    if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
         # blech, it's a SPA! All JS
         # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
         url = html_url + ".pdf"
@@ -321,14 +321,14 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
 
     # JMIR
     # https://mhealth.jmir.org/2020/7/e17891/
-    if '.jmir.org/' in html_url and not "/pdf" in html_url and html_url.endswith("/"):
+    if '.jmir.org/' in html_url and "/pdf" not in html_url and html_url.endswith("/"):
         url = html_url + "pdf"
         return dict(pdf_url=url, technique='jmir-url')
 
     ### below here we are doing guesses
 
     # generic guess: try current URL plus .pdf, if it exists in the HTML body
-    if not '.pdf' in html_url:
+    if '.pdf' not in html_url:
         url = html_url + ".pdf"
         if url.encode('utf-8') in html_body:
             return dict(pdf_url=url, technique='guess-url-plus-pdf')
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index ab0fd61..e2e673f 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -656,10 +656,10 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser,
     """
     self_doc_url: Optional[Tuple[str, str]] = None
     for pattern in patterns:
-        if not 'selector' in pattern:
+        if 'selector' not in pattern:
             continue
         if 'in_doc_url' in pattern:
-            if not pattern['in_doc_url'] in doc_url:
+            if pattern['in_doc_url'] not in doc_url:
                 continue
         elem = doc.css_first(pattern['selector'])
         if not elem:
@@ -669,14 +669,14 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser,
             val = elem.attrs.get(pattern['attr'])
         elif pattern.get('use_body'):
             val = elem.text()
-            if not '://' in val:
+            if '://' not in val:
                 continue
         if not val:
             continue
         val = urllib.parse.urljoin(doc_url, val)
         assert val
         if 'in_fulltext_url' in pattern:
-            if not pattern['in_fulltext_url'] in val:
+            if pattern['in_fulltext_url'] not in val:
                 continue
         for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
             if skip_pattern in val.lower():
@@ -714,7 +714,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
             if val_list:
                 for val in val_list:
                     if 'content' in val.attrs and val.attrs['content']:
-                        if not field in meta:
+                        if field not in meta:
                             meta[field] = []
                         meta[field].append(val.attrs['content'])
                 break
@@ -740,13 +740,13 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
     raw_identifiers = meta.pop('raw_identifiers', [])
     for ident in raw_identifiers:
         if ident.startswith('doi:10.'):
-            if not 'doi' in meta:
+            if 'doi' not in meta:
                 meta['doi'] = ident.replace('doi:', '')
         elif ident.startswith('10.') and '/' in ident:
-            if not 'doi' in meta:
+            if 'doi' not in meta:
                 meta['doi'] = ident
         elif ident.startswith('isbn:'):
-            if not 'isbn' in meta:
+            if 'isbn' not in meta:
                 meta['isbn'] = ident.replace('isbn:', '')
 
     raw_date = meta.pop('raw_date', None)
@@ -813,7 +813,7 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str],
 
     for node in doc.css(selector):
         for attr in attrs:
-            if not attr in node.attrs:
+            if attr not in node.attrs:
                 continue
             url = node.attrs.get(attr)
             # special-case a couple meta URI prefixes which don't match with adblock rules
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index aa4752e..b413bc8 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -399,7 +399,7 @@ class WaybackClient:
         """
         if not self.petabox_webdata_secret:
             raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
-        if not "/" in warc_path:
+        if "/" not in warc_path:
             raise ValueError(
                 "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
         warc_uri = self.warc_uri_prefix + warc_path
@@ -579,9 +579,9 @@ class WaybackClient:
         #print(resp.url, file=sys.stderr)
 
         # defensively check that this is actually correct replay based on headers
-        if not "X-Archive-Src" in resp.headers:
+        if "X-Archive-Src" not in resp.headers:
             raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
-        if not datetime in resp.url:
+        if datetime not in resp.url:
             raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(
                 datetime, resp.url))
 
@@ -634,9 +634,9 @@ class WaybackClient:
 
         # defensively check that this is actually correct replay based on headers
         # previously check for "X-Archive-Redirect-Reason" here
-        if not "X-Archive-Src" in resp.headers:
+        if "X-Archive-Src" not in resp.headers:
             raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
-        if not datetime in resp.url:
+        if datetime not in resp.url:
             raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(
                 datetime, resp.url))
 
@@ -772,7 +772,7 @@ class WaybackClient:
                     cdx=cdx_row,
                     revisit_cdx=None,
                 )
-            if not "://" in resource.location:
+            if "://" not in resource.location:
                 next_url = urllib.parse.urljoin(next_url, resource.location)
             else:
                 next_url = resource.location
@@ -1087,7 +1087,7 @@ class SavePageNowClient:
         #print(spn_result, file=sys.stderr)
 
         # detect partial URL response (aka, success, but missing full URL)
-        if not "://" in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
+        if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
             return ResourceResult(
                 start_url=start_url,
                 hit=False,
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 83a4626..1c779ce 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -58,7 +58,7 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
     # crude checks for XHTML or JATS XML, using only first 1 kB of file
     if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
         mimetype = "application/xhtml+xml"
-    elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+    elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
         mimetype = "application/jats+xml"
     hashes = [
         hashlib.sha1(),
@@ -88,7 +88,7 @@ def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
     # crude checks for XHTML or JATS XML, using only first 1 kB of file
     if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
         mimetype = "application/xhtml+xml"
-    elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+    elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
         mimetype = "application/jats+xml"
     hashes = [
         hashlib.sha1(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index d47a8cb..8ec5979 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -86,7 +86,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
             raw['link_source_id'] = raw['fatcat']['release_ident']
 
         for k in ('ingest_type', 'base_url', 'link_source', 'link_source_id'):
-            if not k in raw:
+            if k not in raw:
                 self.counts['skip-request-fields'] += 1
                 return None
         if raw['ingest_type'] not in ('pdf', 'xml', 'html'):
@@ -120,10 +120,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         if there is a problem with conversion, return None and set skip count
         """
        for k in ('request', 'hit', 'status'):
-            if not k in raw:
+            if k not in raw:
                 self.counts['skip-result-fields'] += 1
                 return None
-        if not 'base_url' in raw['request']:
+        if 'base_url' not in raw['request']:
             self.counts['skip-result-fields'] += 1
             return None
         ingest_type = raw['request'].get('ingest_type')
@@ -181,9 +181,9 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         if there is a problem with conversion, return None and set skip count
         """
         for k in ('request', 'hit', 'status'):
-            if not k in raw:
+            if k not in raw:
                 return None
-        if not 'base_url' in raw['request']:
+        if 'base_url' not in raw['request']:
             return None
         ingest_type = raw['request'].get('ingest_type')
         if ingest_type not in ('dataset'):
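The misc.py hunks show the same `not in` idiom applied to bytes rather than dict keys or strings. Below is a standalone sketch of that mimetype sniff; the helper name `sniff_xml_mimetype` is hypothetical (in the repository this logic lives inline in `gen_file_metadata()` and `gen_file_metadata_path()`), but the conditions mirror the diff above:

```python
from typing import Optional

def sniff_xml_mimetype(blob: bytes) -> Optional[str]:
    """Hypothetical helper mirroring the crude XHTML/JATS sniff in misc.py.

    Only the first 1 kB of the file is inspected, as in the diff above.
    """
    head = blob[:1024]
    if b"<htm" in head and b'xmlns="http://www.w3.org/1999/xhtml"' in head:
        return "application/xhtml+xml"
    elif b"<article " in head and b"<html" not in head:
        return "application/jats+xml"
    return None

# bytes support `in` / `not in` membership tests just like str and dict keys
assert sniff_xml_mimetype(b'<article xmlns="http://jats.nlm.nih.gov">') == "application/jats+xml"
```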