-rwxr-xr-x  python/ia_pdf_match.py                    | 2 +-
-rw-r--r--  python/sandcrawler/fileset_strategies.py  | 2 +-
-rw-r--r--  python/sandcrawler/html_metadata.py       | 2 +-
-rw-r--r--  python/sandcrawler/pdftrio.py             | 2 +-
-rw-r--r--  python/sandcrawler/workers.py             | 2 +-
5 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index 8a95d95..493c9e7 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Input is IA item metadata JSON.
-Ouput is insertable fatcat "match" JSON
+Output is insertable fatcat "match" JSON
 
 - md5
 - sha1
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 6dc77f9..cc88da2 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -92,7 +92,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
             return existing
 
         if item.platform_name == "archiveorg":
-            raise PlatformScopeError("should't download archive.org into itself")
+            raise PlatformScopeError("shouldn't download archive.org into itself")
 
         local_dir = self.working_dir + item.archiveorg_item_name
         assert local_dir.startswith("/")
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 0d1a4a7..c46788e 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -843,7 +843,7 @@ def html_extract_resources(
     """
     This function tries to find all the important resources in a page. The
     presumption is that the HTML document is article fulltext, and we want the
-    list of all resoures (by URL) necessary to replay the page.
+    list of all resources (by URL) necessary to replay the page.
 
     The returned resource URLs each have a type (script, img, css, etc), and
     should be fully-qualified URLs (not relative).
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 0ca83b1..1119211 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -48,7 +48,7 @@ class PdfTrioClient(object):
             return {
                 "status": "error-connect",
                 "status_code": -2,  # heritrix3 "HTTP connect" code
-                "error_msg": "pdftrio request connection timout",
+                "error_msg": "pdftrio request connection timeout",
             }
 
         info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index cca2f2c..597a0ac 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -526,7 +526,7 @@ class KafkaJsonPusher(RecordPusher):
         while True:
             # TODO: this is batch-oriented, because underlying worker is
             # often batch-oriented, but this doesn't confirm that entire batch
-            # has been pushed to fatcat before commiting offset. Eg, consider
+            # has been pushed to fatcat before committing offset. Eg, consider
             # case where there there is one update and thousands of creates;
             # update would be lingering in worker, and if worker crashed
             # never created. Not great.