codespell typos in python (comments)

author: Bryan Newbold <bnewbold@archive.org> 2021-11-24 16:05:11 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-24 16:05:24 -0800
commit: dfd13be5a7ac87b8b6c186986624f97da02b8923 (patch)
tree: c1242cf5ae9d0b9f48967008e8906c5fffc0bbc1 /python
parent: a6cfb01063da8a5172d38d2da190a25e7d070993 (diff)
download: sandcrawler-dfd13be5a7ac87b8b6c186986624f97da02b8923.tar.gz sandcrawler-dfd13be5a7ac87b8b6c186986624f97da02b8923.zip
5 files changed, 5 insertions, 5 deletions
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index 8a95d95..493c9e7 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Input is IA item metadata JSON.
-Ouput is insertable fatcat "match" JSON
+Output is insertable fatcat "match" JSON
 
 - md5
 - sha1
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 6dc77f9..cc88da2 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -92,7 +92,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
             return existing
 
         if item.platform_name == "archiveorg":
-            raise PlatformScopeError("should't download archive.org into itself")
+            raise PlatformScopeError("shouldn't download archive.org into itself")
 
         local_dir = self.working_dir + item.archiveorg_item_name
         assert local_dir.startswith("/")
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 0d1a4a7..c46788e 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -843,7 +843,7 @@ def html_extract_resources(
     """
     This function tries to find all the important resources in a page. The
     presumption is that the HTML document is article fulltext, and we want the
-    list of all resoures (by URL) necessary to replay the page.
+    list of all resources (by URL) necessary to replay the page.
 
     The returned resource URLs each have a type (script, img, css, etc), and
     should be fully-qualified URLs (not relative).
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 0ca83b1..1119211 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -48,7 +48,7 @@ class PdfTrioClient(object):
             return {
                 "status": "error-connect",
                 "status_code": -2,  # heritrix3 "HTTP connect" code
-                "error_msg": "pdftrio request connection timout",
+                "error_msg": "pdftrio request connection timeout",
             }
 
         info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index cca2f2c..597a0ac 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -526,7 +526,7 @@ class KafkaJsonPusher(RecordPusher):
         while True:
             # TODO: this is batch-oriented, because underlying worker is
             # often batch-oriented, but this doesn't confirm that entire batch
-            # has been pushed to fatcat before commiting offset. Eg, consider
+            # has been pushed to fatcat before committing offset. Eg, consider
             # case where there there is one update and thousands of creates;
             # update would be lingering in worker, and if worker crashed
             # never created. Not great.
author	Bryan Newbold <bnewbold@archive.org>	2021-11-24 16:05:11 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2021-11-24 16:05:24 -0800
commit	dfd13be5a7ac87b8b6c186986624f97da02b8923 (patch)
tree	c1242cf5ae9d0b9f48967008e8906c5fffc0bbc1 /python
parent	a6cfb01063da8a5172d38d2da190a25e7d070993 (diff)
download	sandcrawler-dfd13be5a7ac87b8b6c186986624f97da02b8923.tar.gz sandcrawler-dfd13be5a7ac87b8b6c186986624f97da02b8923.zip