about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-11-24 16:05:11 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-24 16:05:24 -0800
commit: dfd13be5a7ac87b8b6c186986624f97da02b8923 (patch)
tree: c1242cf5ae9d0b9f48967008e8906c5fffc0bbc1
parent: a6cfb01063da8a5172d38d2da190a25e7d070993 (diff)
download: sandcrawler-dfd13be5a7ac87b8b6c186986624f97da02b8923.tar.gz
download: sandcrawler-dfd13be5a7ac87b8b6c186986624f97da02b8923.zip
codespell typos in python (comments)
-rwxr-xr-x python/ia_pdf_match.py 2
-rw-r--r-- python/sandcrawler/fileset_strategies.py 2
-rw-r--r-- python/sandcrawler/html_metadata.py 2
-rw-r--r-- python/sandcrawler/pdftrio.py 2
-rw-r--r-- python/sandcrawler/workers.py 2
5 files changed, 5 insertions, 5 deletions
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index 8a95d95..493c9e7 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Input is IA item metadata JSON.
-Ouput is insertable fatcat "match" JSON
+Output is insertable fatcat "match" JSON
- md5
- sha1
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 6dc77f9..cc88da2 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -92,7 +92,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
return existing
if item.platform_name == "archiveorg":
- raise PlatformScopeError("should't download archive.org into itself")
+ raise PlatformScopeError("shouldn't download archive.org into itself")
local_dir = self.working_dir + item.archiveorg_item_name
assert local_dir.startswith("/")
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 0d1a4a7..c46788e 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -843,7 +843,7 @@ def html_extract_resources(
"""
This function tries to find all the important resources in a page. The
presumption is that the HTML document is article fulltext, and we want the
- list of all resoures (by URL) necessary to replay the page.
+ list of all resources (by URL) necessary to replay the page.
The returned resource URLs each have a type (script, img, css, etc), and
should be fully-qualified URLs (not relative).
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 0ca83b1..1119211 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -48,7 +48,7 @@ class PdfTrioClient(object):
return {
"status": "error-connect",
"status_code": -2, # heritrix3 "HTTP connect" code
- "error_msg": "pdftrio request connection timout",
+ "error_msg": "pdftrio request connection timeout",
}
info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index cca2f2c..597a0ac 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -526,7 +526,7 @@ class KafkaJsonPusher(RecordPusher):
while True:
# TODO: this is batch-oriented, because underlying worker is
# often batch-oriented, but this doesn't confirm that entire batch
- # has been pushed to fatcat before commiting offset. Eg, consider
+ # has been pushed to fatcat before committing offset. Eg, consider
# case where there is one update and thousands of creates;
# update would be lingering in worker, and if worker crashed
# never created. Not great.