diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
commit | 05bd7cbcc62588e431c5efd533189e246b2a997e (patch) | |
tree | abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/sandcrawler/ingest_file.py | |
parent | f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff) | |
download | sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip |
make fmt
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 145 |
1 files changed, 81 insertions, 64 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 137a793..b480cc2 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -1,4 +1,3 @@ - import base64 import gzip import json @@ -15,18 +14,22 @@ from selectolax.parser import HTMLParser from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.grobid import GrobidClient from sandcrawler.html import extract_fulltext_url -from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules -from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient, - SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict, +from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, + html_extract_resources, load_adblock_rules) +from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, + ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient, + WaybackContentError, WaybackError, cdx_to_dict, fix_transfer_encoding) -from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform, +from sandcrawler.ingest_html import (WebResource, fetch_html_resources, + html_extract_body_teixml, html_guess_platform, html_guess_scope, quick_fetch_html_resources) from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime from sandcrawler.pdfextract import PdfExtractResult, process_pdf from sandcrawler.workers import SandcrawlerWorker from sandcrawler.xml import xml_reserialize -MAX_BODY_SIZE_BYTES = 128*1024*1024 +MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024 + class IngestFileWorker(SandcrawlerWorker): """ @@ -54,7 +57,6 @@ class IngestFileWorker(SandcrawlerWorker): process_file_hit(ResourceResult) -> response process_grobid(ResourceResult) """ - def __init__(self, sink=None, **kwargs): super().__init__() @@ -64,7 +66,8 @@ class IngestFileWorker(SandcrawlerWorker): self.wayback_client = WaybackClient() self.spn_client = kwargs.get('spn_client') if not self.spn_client: - self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) + self.spn_client = SavePageNowClient( + spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) self.grobid_client = kwargs.get('grobid_client') if not self.grobid_client: self.grobid_client = GrobidClient() @@ -123,13 +126,13 @@ class IngestFileWorker(SandcrawlerWorker): "fao.org/glis/", # Historical non-paper content: - "dhz.uni-passau.de/", # newspapers - "digital.ucd.ie/", # ireland national historical + "dhz.uni-passau.de/", # newspapers + "digital.ucd.ie/", # ireland national historical # DOI prefixes - "doi.org/10.2307/", # JSTOR; slow and many redirects - "doi.org/10.18730/", # fao.org: database entry - "doi.org/10.15468/", # gbif.org: database entry + "doi.org/10.2307/", # JSTOR; slow and many redirects + "doi.org/10.18730/", # fao.org: database entry + "doi.org/10.15468/", # gbif.org: database entry # deprecated domain (doesn't redirect correctly) "://edoc.mpg.de/", @@ -173,10 +176,10 @@ class IngestFileWorker(SandcrawlerWorker): "video/mpeg", "text/plain", "text/csv", - "text/x-r-source", # dataverse - "text/tab-separated-values", # dataverse - "text/x-rst", # dataverse - "application/x-rlang-transport", # dataverse + "text/x-r-source", # dataverse + "text/tab-separated-values", # dataverse + "text/x-rst", # dataverse + "application/x-rlang-transport", # dataverse "application/json", "application/xml", "application/pdf", @@ -194,7 +197,6 @@ class IngestFileWorker(SandcrawlerWorker): "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ] - def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]: """ Check in sandcrawler-db (postgres) to see if we have already ingested @@ -214,7 +216,10 @@ class IngestFileWorker(SandcrawlerWorker): else: return None - def find_resource(self, url, best_mimetype=None, force_recrawl=False) -> Optional[ResourceResult]: + def find_resource(self, + url, + best_mimetype=None, + force_recrawl=False) -> Optional[ResourceResult]: """ Looks in wayback for a resource starting at the URL, following any redirects. If a hit isn't found, try crawling with SPN. @@ -222,7 +227,8 @@ class IngestFileWorker(SandcrawlerWorker): via = "none" resource = None - if url.startswith("http://web.archive.org/web/") or url.startswith("https://web.archive.org/web/"): + if url.startswith("http://web.archive.org/web/") or url.startswith( + "https://web.archive.org/web/"): raise NotImplementedError("handling direct wayback links not supported yet") if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"): @@ -243,14 +249,13 @@ class IngestFileWorker(SandcrawlerWorker): if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000': old_failure = True - if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') or soft404 or old_failure): + if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') + or soft404 or old_failure): via = "spn2" resource = self.spn_client.crawl_resource(url, self.wayback_client) - print("[FETCH {:>6}] {} {}".format( - via, - (resource and resource.status), - (resource and resource.terminal_url) or url), - file=sys.stderr) + print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status), + (resource and resource.terminal_url) or url), + file=sys.stderr) return resource def process_existing(self, request: dict, result_row: dict) -> dict: @@ -262,7 +267,8 @@ class IngestFileWorker(SandcrawlerWorker): assert result_row['hit'] existing_file_meta = self.pgrest_client.get_file_meta(result_row['terminal_sha1hex']) existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) - existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt']) + existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], + result_row['terminal_dt']) if not (existing_file_meta and existing_grobid and existing_cdx): raise NotImplementedError("partially-exsiting records not implemented yet") result = { @@ -281,11 +287,13 @@ class IngestFileWorker(SandcrawlerWorker): } return result - def process_file_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict: + def process_file_hit(self, ingest_type: str, resource: ResourceResult, + file_meta: dict) -> dict: """ Run all the necessary processing for a new/fresh ingest hit. """ - if ingest_type in ["dataset-file", "component"] and file_meta['mimetype'] == "application/pdf": + if ingest_type in ["dataset-file", "component" + ] and file_meta['mimetype'] == "application/pdf": ingest_type = "pdf" if ingest_type == "pdf": return { @@ -396,24 +404,26 @@ class IngestFileWorker(SandcrawlerWorker): try: html_doc = HTMLParser(resource.body) except ValueError as ve: - return dict( - status="html-selectolax-error", - ) + return dict(status="html-selectolax-error", ) html_biblio = html_extract_biblio(resource.terminal_url, html_doc) assert html_biblio html_body = html_extract_body_teixml(resource.body) html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio) - html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count')) + html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, + html_body.get('word_count')) html_biblio_dict = json.loads(html_biblio.json(exclude_none=True)) - if html_scope in ('blocked-captcha','blocked-cookie','blocked-forbidden'): + if html_scope in ('blocked-captcha', 'blocked-cookie', 'blocked-forbidden'): return dict( status=html_scope, html_biblio=html_biblio_dict, scope=html_scope, platform=html_platform, ) - elif html_scope not in ('article-fulltext','unknown',): + elif html_scope not in ( + 'article-fulltext', + 'unknown', + ): html_body.pop("tei_xml", None) return dict( status="wrong-scope", @@ -423,7 +433,8 @@ class IngestFileWorker(SandcrawlerWorker): html_body=html_body, ) - raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules) + raw_resources = html_extract_resources(resource.terminal_url, html_doc, + self.adblock_rules) if len(raw_resources) > self.max_html_resources: html_body.pop("tei_xml", None) return dict( @@ -452,7 +463,9 @@ class IngestFileWorker(SandcrawlerWorker): try: if self.html_quick_mode: print(" WARN: running quick CDX-only fetches", file=sys.stderr) - full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when) + full_resources = quick_fetch_html_resources(raw_resources, + self.wayback_client.cdx_client, + when) else: full_resources = fetch_html_resources(raw_resources, self.wayback_client, when) except PetaboxError as e: @@ -572,7 +585,9 @@ class IngestFileWorker(SandcrawlerWorker): return result try: - resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl) + resource = self.find_resource(next_url, + best_mimetype, + force_recrawl=force_recrawl) except SavePageNowError as e: result['status'] = 'spn2-error' result['error_message'] = str(e)[:1600] @@ -650,10 +665,9 @@ class IngestFileWorker(SandcrawlerWorker): # here we split based on ingest type to try and extract a next hop html_ish_resource = bool( "html" in file_meta['mimetype'] - or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml" + or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml" or "application/xml" in file_meta['mimetype'] - or "text/xml" in file_meta['mimetype'] - ) + or "text/xml" in file_meta['mimetype']) html_biblio = None html_doc = None if html_ish_resource and resource.body: @@ -662,7 +676,8 @@ class IngestFileWorker(SandcrawlerWorker): html_biblio = html_extract_biblio(resource.terminal_url, html_doc) if html_biblio: if not 'html_biblio' in result or html_biblio.title: - result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True)) + result['html_biblio'] = json.loads( + html_biblio.json(exclude_none=True)) #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr) except ValueError: pass @@ -686,18 +701,19 @@ class IngestFileWorker(SandcrawlerWorker): assert next_url next_url = clean_url(next_url) print("[PARSE {:>6}] {} {}".format( - ingest_type, - fulltext_url.get('technique'), - next_url, - ), - file=sys.stderr) + ingest_type, + fulltext_url.get('technique'), + next_url, + ), + file=sys.stderr) if next_url in hops: result['status'] = 'link-loop' result['error_message'] = "repeated: {}".format(next_url) return result hops.append(next_url) continue - elif ingest_type in ("xml", "html", "component") and html_ish_resource and html_biblio: + elif ingest_type in ("xml", "html", + "component") and html_ish_resource and html_biblio: # NOTE: src_fulltext_url is not a thing next_url_found = None if ingest_type == "xml" and html_biblio.xml_fulltext_url: @@ -711,11 +727,11 @@ class IngestFileWorker(SandcrawlerWorker): next_url = next_url_found technique = "html_biblio" print("[PARSE {:>6}] {} {}".format( - ingest_type, - technique, - next_url, - ), - file=sys.stderr) + ingest_type, + technique, + next_url, + ), + file=sys.stderr) if next_url in hops: if ingest_type == "html": # for HTML ingest, we don't count this as a link-loop @@ -756,7 +772,8 @@ class IngestFileWorker(SandcrawlerWorker): result['status'] = "wrong-mimetype" # formerly: "other-mimetype" return result elif ingest_type == "xml": - if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"): + if file_meta['mimetype'] not in ("application/xml", "text/xml", + "application/jats+xml"): result['status'] = "wrong-mimetype" return result elif ingest_type == "html": @@ -786,18 +803,18 @@ class IngestFileWorker(SandcrawlerWorker): result['hit'] = True if ingest_type == "pdf": print("[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format( - ingest_type, - result.get('file_meta', {}).get('sha1hex'), - result.get('grobid', {}).get('status_code'), - result.get('pdf_meta', {}).get('status'), - ), - file=sys.stderr) + ingest_type, + result.get('file_meta', {}).get('sha1hex'), + result.get('grobid', {}).get('status_code'), + result.get('pdf_meta', {}).get('status'), + ), + file=sys.stderr) else: print("[SUCCESS {:>5}] sha1:{}".format( - ingest_type, - result.get('file_meta', {}).get('sha1hex'), - ), - file=sys.stderr) + ingest_type, + result.get('file_meta', {}).get('sha1hex'), + ), + file=sys.stderr) return result |