author     Bryan Newbold <bnewbold@archive.org>  2021-10-26 18:12:23 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2021-10-26 18:12:23 -0700
commit     485dd2cfd120c52bbc5cc7745e44176d1003b40d (patch)
tree       966bf78a4bd3cc1f6c94efb8fc3054a8a441dab0 /python
parent     7087e7f65d8b81e29af44a43c1067bb2ec618c4e (diff)
download   sandcrawler-485dd2cfd120c52bbc5cc7745e44176d1003b40d.tar.gz
           sandcrawler-485dd2cfd120c52bbc5cc7745e44176d1003b40d.zip
lint collection membership (last lint for now)
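
For reference, the lint applied throughout this commit is the collection-membership idiom (e.g. pycodestyle E713): membership tests written as "not x in y" are rewritten to the equivalent "x not in y". A minimal sketch of the pattern, using hypothetical names:

    # hypothetical example values
    meta = {}
    field = "doi"

    # before: flagged by the linter (membership test should use 'not in')
    if not field in meta:
        meta[field] = []

    # after: same behavior, idiomatic membership test
    if field not in meta:
        meta[field] = []

Both forms evaluate identically, so the diff below is mechanical and behavior-preserving.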
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/db.py                 |  2
-rw-r--r--  python/sandcrawler/fileset_platforms.py  |  4
-rw-r--r--  python/sandcrawler/html.py               | 12
-rw-r--r--  python/sandcrawler/html_metadata.py      | 18
-rw-r--r--  python/sandcrawler/ia.py                 | 14
-rw-r--r--  python/sandcrawler/misc.py               |  4
-rw-r--r--  python/sandcrawler/persist.py            | 10
7 files changed, 32 insertions(+), 32 deletions(-)
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 05fedc6..ee4d3bf 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -208,7 +208,7 @@ class SandcrawlerPostgresClient:
# though (to save database space)
dupe_fields = ('fatcat_release', 'grobid_version')
for k in dupe_fields:
- if not k in r:
+ if k not in r:
r[k] = r['metadata'].get(k)
r['metadata'].pop(k, None)
r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 6d66d81..b6808b5 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -369,7 +369,7 @@ class FigshareHelper(FilesetPlatformHelper):
platform_domain = components.netloc.split(':')[0].lower()
# only work with full, versioned figshare.com URLs
- if not 'figshare.com' in platform_domain:
+ if 'figshare.com' not in platform_domain:
return False
try:
@@ -537,7 +537,7 @@ class ZenodoHelper(FilesetPlatformHelper):
platform_id = components.path.split('/')[2]
assert platform_id.isdigit(), f"expected numeric: {platform_id}"
- if not 'zenodo.org' in platform_domain:
+ if 'zenodo.org' not in platform_domain:
raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}")
# 2. API fetch
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index abd3d50..4d36573 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -48,7 +48,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
if meta and not meta.get('content'):
meta = None
# wiley has a weird almost-blank page we don't want to loop on
- if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
+ if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
url = meta['content'].strip()
if '://doi.org/' in url:
print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
@@ -198,7 +198,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
# american archivist (OA)
# https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
- if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
+ if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
# use a more aggressive direct guess to avoid rate-limiting...
if "/doi/10." in html_url:
url = html_url.replace("/doi/10.", "/doi/pdf/10.")
@@ -240,7 +240,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
# www.ahajournals.org
# https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
- if "://www.ahajournals.org/doi/" in html_url and not '/doi/pdf/' in html_url:
+ if "://www.ahajournals.org/doi/" in html_url and '/doi/pdf/' not in html_url:
# <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
if b'/doi/pdf/10.' in html_body:
url = html_url.replace('/doi/10.', '/doi/pdf/10.')
@@ -259,7 +259,7 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
# cogentoa.com
# https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
- if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url:
+ if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
# blech, it's a SPA! All JS
# https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
url = html_url + ".pdf"
@@ -321,14 +321,14 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
# JMIR
# https://mhealth.jmir.org/2020/7/e17891/
- if '.jmir.org/' in html_url and not "/pdf" in html_url and html_url.endswith("/"):
+ if '.jmir.org/' in html_url and "/pdf" not in html_url and html_url.endswith("/"):
url = html_url + "pdf"
return dict(pdf_url=url, technique='jmir-url')
### below here we are doing guesses
# generic guess: try current URL plus .pdf, if it exists in the HTML body
- if not '.pdf' in html_url:
+ if '.pdf' not in html_url:
url = html_url + ".pdf"
if url.encode('utf-8') in html_body:
return dict(pdf_url=url, technique='guess-url-plus-pdf')
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index ab0fd61..e2e673f 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -656,10 +656,10 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser,
"""
self_doc_url: Optional[Tuple[str, str]] = None
for pattern in patterns:
- if not 'selector' in pattern:
+ if 'selector' not in pattern:
continue
if 'in_doc_url' in pattern:
- if not pattern['in_doc_url'] in doc_url:
+ if pattern['in_doc_url'] not in doc_url:
continue
elem = doc.css_first(pattern['selector'])
if not elem:
@@ -669,14 +669,14 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser,
val = elem.attrs.get(pattern['attr'])
elif pattern.get('use_body'):
val = elem.text()
- if not '://' in val:
+ if '://' not in val:
continue
if not val:
continue
val = urllib.parse.urljoin(doc_url, val)
assert val
if 'in_fulltext_url' in pattern:
- if not pattern['in_fulltext_url'] in val:
+ if pattern['in_fulltext_url'] not in val:
continue
for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
if skip_pattern in val.lower():
@@ -714,7 +714,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
if val_list:
for val in val_list:
if 'content' in val.attrs and val.attrs['content']:
- if not field in meta:
+ if field not in meta:
meta[field] = []
meta[field].append(val.attrs['content'])
break
@@ -740,13 +740,13 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
raw_identifiers = meta.pop('raw_identifiers', [])
for ident in raw_identifiers:
if ident.startswith('doi:10.'):
- if not 'doi' in meta:
+ if 'doi' not in meta:
meta['doi'] = ident.replace('doi:', '')
elif ident.startswith('10.') and '/' in ident:
- if not 'doi' in meta:
+ if 'doi' not in meta:
meta['doi'] = ident
elif ident.startswith('isbn:'):
- if not 'isbn' in meta:
+ if 'isbn' not in meta:
meta['isbn'] = ident.replace('isbn:', '')
raw_date = meta.pop('raw_date', None)
@@ -813,7 +813,7 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str],
for node in doc.css(selector):
for attr in attrs:
- if not attr in node.attrs:
+ if attr not in node.attrs:
continue
url = node.attrs.get(attr)
# special-case a couple meta URI prefixes which don't match with adblock rules
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index aa4752e..b413bc8 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -399,7 +399,7 @@ class WaybackClient:
"""
if not self.petabox_webdata_secret:
raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
- if not "/" in warc_path:
+ if "/" not in warc_path:
raise ValueError(
"what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
warc_uri = self.warc_uri_prefix + warc_path
@@ -579,9 +579,9 @@ class WaybackClient:
#print(resp.url, file=sys.stderr)
# defensively check that this is actually correct replay based on headers
- if not "X-Archive-Src" in resp.headers:
+ if "X-Archive-Src" not in resp.headers:
raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
- if not datetime in resp.url:
+ if datetime not in resp.url:
raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(
datetime, resp.url))
@@ -634,9 +634,9 @@ class WaybackClient:
# defensively check that this is actually correct replay based on headers
# previously check for "X-Archive-Redirect-Reason" here
- if not "X-Archive-Src" in resp.headers:
+ if "X-Archive-Src" not in resp.headers:
raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
- if not datetime in resp.url:
+ if datetime not in resp.url:
raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(
datetime, resp.url))
@@ -772,7 +772,7 @@ class WaybackClient:
cdx=cdx_row,
revisit_cdx=None,
)
- if not "://" in resource.location:
+ if "://" not in resource.location:
next_url = urllib.parse.urljoin(next_url, resource.location)
else:
next_url = resource.location
@@ -1087,7 +1087,7 @@ class SavePageNowClient:
#print(spn_result, file=sys.stderr)
# detect partial URL response (aka, success, but missing full URL)
- if not "://" in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
+ if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith('/'):
return ResourceResult(
start_url=start_url,
hit=False,
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 83a4626..1c779ce 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -58,7 +58,7 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
# crude checks for XHTML or JATS XML, using only first 1 kB of file
if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
mimetype = "application/xhtml+xml"
- elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
mimetype = "application/jats+xml"
hashes = [
hashlib.sha1(),
@@ -88,7 +88,7 @@ def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict:
# crude checks for XHTML or JATS XML, using only first 1 kB of file
if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]:
mimetype = "application/xhtml+xml"
- elif b"<article " in blob[:1024] and not b"<html" in blob[:1024]:
+ elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]:
mimetype = "application/jats+xml"
hashes = [
hashlib.sha1(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index d47a8cb..8ec5979 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -86,7 +86,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
raw['link_source_id'] = raw['fatcat']['release_ident']
for k in ('ingest_type', 'base_url', 'link_source', 'link_source_id'):
- if not k in raw:
+ if k not in raw:
self.counts['skip-request-fields'] += 1
return None
if raw['ingest_type'] not in ('pdf', 'xml', 'html'):
@@ -120,10 +120,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if there is a problem with conversion, return None and set skip count
"""
for k in ('request', 'hit', 'status'):
- if not k in raw:
+ if k not in raw:
self.counts['skip-result-fields'] += 1
return None
- if not 'base_url' in raw['request']:
+ if 'base_url' not in raw['request']:
self.counts['skip-result-fields'] += 1
return None
ingest_type = raw['request'].get('ingest_type')
@@ -181,9 +181,9 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if there is a problem with conversion, return None and set skip count
"""
for k in ('request', 'hit', 'status'):
- if not k in raw:
+ if k not in raw:
return None
- if not 'base_url' in raw['request']:
+ if 'base_url' not in raw['request']:
return None
ingest_type = raw['request'].get('ingest_type')
if ingest_type not in ('dataset'):