From dfd13be5a7ac87b8b6c186986624f97da02b8923 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 16:05:11 -0800 Subject: codespell typos in python (comments) --- python/ia_pdf_match.py | 2 +- python/sandcrawler/fileset_strategies.py | 2 +- python/sandcrawler/html_metadata.py | 2 +- python/sandcrawler/pdftrio.py | 2 +- python/sandcrawler/workers.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py index 8a95d95..493c9e7 100755 --- a/python/ia_pdf_match.py +++ b/python/ia_pdf_match.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Input is IA item metadata JSON. -Ouput is insertable fatcat "match" JSON +Output is insertable fatcat "match" JSON - md5 - sha1 diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 6dc77f9..cc88da2 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -92,7 +92,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): return existing if item.platform_name == "archiveorg": - raise PlatformScopeError("should't download archive.org into itself") + raise PlatformScopeError("shouldn't download archive.org into itself") local_dir = self.working_dir + item.archiveorg_item_name assert local_dir.startswith("/") diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 0d1a4a7..c46788e 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -843,7 +843,7 @@ def html_extract_resources( """ This function tries to find all the important resources in a page. The presumption is that the HTML document is article fulltext, and we want the - list of all resoures (by URL) necessary to replay the page. + list of all resources (by URL) necessary to replay the page. The returned resource URLs each have a type (script, img, css, etc), and should be fully-qualified URLs (not relative). diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 0ca83b1..1119211 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -48,7 +48,7 @@ class PdfTrioClient(object): return { "status": "error-connect", "status_code": -2, # heritrix3 "HTTP connect" code - "error_msg": "pdftrio request connection timout", + "error_msg": "pdftrio request connection timeout", } info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code) diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index cca2f2c..597a0ac 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -526,7 +526,7 @@ class KafkaJsonPusher(RecordPusher): while True: # TODO: this is batch-oriented, because underlying worker is # often batch-oriented, but this doesn't confirm that entire batch - # has been pushed to fatcat before commiting offset. Eg, consider + # has been pushed to fatcat before committing offset. Eg, consider # case where there there is one update and thousands of creates; # update would be lingering in worker, and if worker crashed # never created. Not great. -- cgit v1.2.3