author | Bryan Newbold <bnewbold@archive.org> | 2021-10-04 13:01:58 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:20 -0700 |
commit | e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e (patch) | |
tree | bee2b4343f30d59fec463dbdcaafafc11d7cd513 | |
parent | 452475df7619f3743eac5ad86e2e1fb8ba9972da (diff) | |
download | sandcrawler-e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e.tar.gz, sandcrawler-e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e.zip | |
progress on fileset/dataset ingest
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 39
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 22
-rw-r--r-- | python/sandcrawler/fileset_types.py | 43
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 299
4 files changed, 403 insertions, 0 deletions
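
The core of the commit is a dispatch pattern: `IngestFilesetWorker.process()` loops over registered platform helpers, and the first helper whose `match_request()` returns true drives the rest of the ingest. A minimal, self-contained sketch of that loop follows; `FakeDataverseHelper` and the request literal are illustrative stand-ins, not part of this commit:

```python
# Minimal sketch of the platform-helper dispatch used in process().
# FakeDataverseHelper is a stand-in; real helpers also inspect the fetched
# resource and extracted HTML biblio metadata, not just the request.

class FakeDataverseHelper:
    platform_name = "dataverse"

    def match_request(self, request: dict) -> bool:
        # crude URL match, for illustration only
        return "dataverse" in request.get("base_url", "")

helpers = {"dataverse": FakeDataverseHelper()}
request = {
    "ingest_type": "dataset",
    "base_url": "https://dataverse.example.edu/dataset.xhtml?persistentId=doi:10.123/ABC",
}

platform_helper = None
for (helper_name, helper) in helpers.items():
    if helper.match_request(request):
        platform_helper = helper
        break

print(platform_helper.platform_name if platform_helper else "no-platform-match")
```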
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
new file mode 100644
index 0000000..7aeacf2
--- /dev/null
+++ b/python/sandcrawler/fileset_platforms.py
@@ -0,0 +1,39 @@
+
+import sys
+import json
+import gzip
+import time
+from collections import namedtuple
+from typing import Optional, Tuple, Any, Dict, List
+
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.fileset_types import DatasetPlatformItem
+
+
+class DatasetPlatformHelper():
+
+    def __init__(self):
+        self.platform_name = 'unknown'
+
+    def match_request(self, request: dict, resource: ResourceResult, html_biblio: Optional[BiblioMetadata]) -> bool:
+        """
+        Does this request look like it matches this platform?
+        """
+        raise NotImplementedError()
+
+    def get_item(self, request: dict, resource: ResourceResult, html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
+        """
+        Fetch platform-specific metadata for this request (eg, via API calls)
+        """
+        raise NotImplementedError()
+
+
+class DataverseHelper(DatasetPlatformHelper):
+
+    def __init__(self):
+        self.platform_name = 'dataverse'
+
+
+class ArchiveOrgHelper(DatasetPlatformHelper):
+
+    def __init__(self):
+        self.platform_name = 'archiveorg'
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
new file mode 100644
index 0000000..592b475
--- /dev/null
+++ b/python/sandcrawler/fileset_strategies.py
@@ -0,0 +1,22 @@
+
+import sys
+import json
+import gzip
+import time
+from collections import namedtuple
+from typing import Optional, Tuple, Any, Dict, List
+
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem
+
+
+class FilesetIngestStrategy():
+
+    def __init__(self):
+        self.ingest_strategy = 'unknown'
+
+    def check_existing(self):  # XXX: -> Any:
+        raise NotImplementedError()
+
+    def process(self, item: DatasetPlatformItem):
+        raise NotImplementedError()
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
new file mode 100644
index 0000000..f0f03db
--- /dev/null
+++ b/python/sandcrawler/fileset_types.py
@@ -0,0 +1,43 @@
+
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+class IngestStrategy(str, Enum):
+    WebFile = "web-file"
+    WebFileset = "web-fileset"
+    WebFilesetBundled = "web-fileset-bundled"
+    ArchiveorgFile = "archiveorg-file"
+    ArchiveorgFileset = "archiveorg-fileset"
+    ArchiveorgFilesetBundled = "archiveorg-fileset-bundled"
+
+class FilesetManifestFile(BaseModel):
+    path: str
+    size: Optional[int]
+    md5: Optional[str]
+    sha1: Optional[str]
+    sha256: Optional[str]
+    mimetype: Optional[str]
+
+    status: Optional[str]
+    platform_url: Optional[str]
+    terminal_url: Optional[str]
+    terminal_dt: Optional[str]
+    extra: Optional[Dict[str, Any]]
+
+class DatasetPlatformItem(BaseModel):
+    platform_name: str
+    platform_status: str
+    manifest: Optional[List[FilesetManifestFile]]
+
+    platform_domain: Optional[str]
+    platform_id: Optional[str]
+    archiveorg_item_name: Optional[str]
+    archiveorg_collection: Optional[str]
+    web_base_url: Optional[str]
+    web_bundle_url: Optional[str]
+
+class ArchiveStrategyResult(BaseModel):
+    ingest_strategy: str
+    status: str
+    manifest: List[FilesetManifestFile]
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
new file mode 100644
index 0000000..9ffaa47
--- /dev/null
+++ b/python/sandcrawler/ingest_fileset.py
@@ -0,0 +1,299 @@
+
+import sys
+import json
+import gzip
+import time
+from collections import namedtuple
+from typing import Optional, Tuple, Any, Dict, List
+
+import requests
+from selectolax.parser import HTMLParser
+
+from sandcrawler.ia import (SavePageNowClient, CdxApiClient, WaybackClient,
+    WaybackError, WaybackContentError, SavePageNowError, CdxApiError,
+    PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding,
+    NoCaptureError)
+from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_ingest import fetch_html_resources, \
+    quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
+    WebResource, html_guess_platform
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+# NOTE: this import was missing from the original diff; IngestFileWorker is
+# assumed to live in sandcrawler.ingest at this point in the repo's history
+from sandcrawler.ingest import IngestFileWorker
+from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.db import SandcrawlerPostgrestClient
+
+
+MAX_BODY_SIZE_BYTES = 128*1024*1024
+
+class IngestFilesetWorker(IngestFileWorker):
+    """
+    The general process is:
+
+    1. crawl base_url, and use the request and landing page resource (eg,
+       HTML) to determine the platform being targeted
+    2. use a platform-specific helper to fetch metadata about the work,
+       including a manifest of files, and select an "ingest strategy" and
+       any required context
+    3. use a strategy-specific helper to archive files from the manifest
+       (first checking whether the content has been archived already)
+    4. summarize the status
+    """
+
+    def __init__(self, sink=None, **kwargs):
+        super().__init__(sink=None, **kwargs)
+
+        self.sink = sink
+        # these mappings get populated with platform/strategy helper
+        # instances; wiring them up is still in progress in this commit
+        self.dataset_platform_helpers: Dict[str, Any] = dict()
+        self.dataset_strategy_archivers: Dict[str, Any] = dict()
+
+    def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+        """
+        Same as the file version, but uses the fileset result table
+        """
+        if not self.try_existing_ingest:
+            return None
+        existing = self.pgrest_client.get_ingest_fileset_result(ingest_type, base_url)
+        # TODO: filter on more flags?
+        if existing and existing['hit'] is True:
+            return existing
+        else:
+            return None
+
+    def process_existing(self, request: dict, result_row: dict) -> dict:
+        """
+        If we have an existing ingest fileset result, do any database fetches
+        or additional processing necessary to return a result.
+        """
+        raise NotImplementedError("process_existing() not tested or safe yet")
+
+    # XXX: use file version
+    #def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
+
+    def want(self, request: dict) -> bool:
+        if request.get('ingest_type') not in ('dataset',):
+            return False
+        return True
+
+    def process(self, request: dict, key: Any = None) -> dict:
+
+        ingest_type = request.get('ingest_type')
+        if ingest_type not in ("dataset",):
+            raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+        # parse/clean URL
+        # note that we pass through the original/raw URL, and that is what
+        # gets persisted in the database table
+        base_url = clean_url(request['base_url'])
+
+        force_recrawl = bool(request.get('force_recrawl', False))
+
+        for block in self.base_url_blocklist:
+            if block in base_url:
+                print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+                return dict(request=request, hit=False, status="skip-url-blocklist")
+
+        print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+        # TODO
+        #existing = self.check_existing_ingest(ingest_type, base_url)
+        #if existing:
+        #    return self.process_existing(request, existing)
+
+        # track redirect hops in the result dict so later code can append
+        hops = [base_url]
+        result: Dict[str, Any] = dict(request=request, hit=False, hops=hops)
+
+        # 1. Determine `platform`, which may involve resolving redirects and
+        #    crawling a landing page.
+
+        ### START COPYPASTA from process_file(), should refactor ###
+
+        # these two were undefined in the copied code; set them explicitly
+        next_url = hops[-1]
+        best_mimetype = None
+
+        # check against blocklist
+        for block in self.base_url_blocklist:
+            if block in next_url:
+                result['status'] = "skip-url-blocklist"
+                return result
+
+        try:
+            resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
+        except SavePageNowError as e:
+            result['status'] = 'spn2-error'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except PetaboxError as e:
+            result['status'] = 'petabox-error'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except CdxApiError as e:
+            result['status'] = 'cdx-error'
+            result['error_message'] = str(e)[:1600]
+            # add a sleep in the cdx-error path as a slow-down
+            time.sleep(2.0)
+            return result
+        except WaybackError as e:
+            result['status'] = 'wayback-error'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except WaybackContentError as e:
+            result['status'] = 'wayback-content-error'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except NotImplementedError as e:
+            result['status'] = 'not-implemented'
+            result['error_message'] = str(e)[:1600]
+            return result
+
+        assert resource
+
+        if resource.terminal_url:
+            result['terminal'] = {
+                "terminal_url": resource.terminal_url,
+                "terminal_dt": resource.terminal_dt,
+                "terminal_status_code": resource.terminal_status_code,
+            }
+            if resource.terminal_url not in result['hops']:
+                result['hops'].append(resource.terminal_url)
+
+        if not resource.hit:
+            result['status'] = resource.status
+            return result
+
+        if resource.terminal_url:
+            for pattern in self.base_url_blocklist:
+                if pattern in resource.terminal_url:
+                    result['status'] = 'skip-url-blocklist'
+                    return result
+
+        if resource.terminal_url:
+            for pattern in self.cookie_blocklist:
+                if pattern in resource.terminal_url:
+                    result['status'] = 'blocked-cookie'
+                    return result
+
+        if not resource.body:
+            result['status'] = 'null-body'
+            return result
+
+        if len(resource.body) > MAX_BODY_SIZE_BYTES:
+            result['status'] = 'body-too-large'
+            return result
+
+        file_meta = gen_file_metadata(resource.body)
+        try:
+            file_meta, resource = fix_transfer_encoding(file_meta, resource)
+        except Exception as e:
+            result['status'] = 'bad-gzip-encoding'
+            result['error_message'] = str(e)
+            return result
+
+        if not resource.body or file_meta['size_bytes'] == 0:
+            result['status'] = 'null-body'
+            return result
+
+        # here we split based on ingest type to try and extract a next hop
+        html_ish_resource = bool(
+            "html" in file_meta['mimetype']
+            or "xhtml" in file_meta['mimetype']  # matches "application/xhtml+xml"
+            or "application/xml" in file_meta['mimetype']
+            or "text/xml" in file_meta['mimetype']
+        )
+        html_biblio = None
+        html_doc = None
+        if html_ish_resource and resource.body:
+            try:
+                html_doc = HTMLParser(resource.body)
+                html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+                if html_biblio:
+                    if 'html_biblio' not in result or html_biblio.title:
+                        result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
+                        #print(f"  setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+            except ValueError:
+                pass
+
+        # the fetch must be a hit if we got this far (though not necessarily
+        # an ingest hit!)
+        assert resource
+        assert resource.hit is True
+        assert resource.terminal_status_code in (200, 226)
+
+        if resource.terminal_url:
+            result['terminal'] = {
+                "terminal_url": resource.terminal_url,
+                "terminal_dt": resource.terminal_dt,
+                "terminal_status_code": resource.terminal_status_code,
+                "terminal_sha1hex": file_meta['sha1hex'],
+            }
+
+        result['file_meta'] = file_meta
+        result['cdx'] = cdx_to_dict(resource.cdx)
+        if resource.revisit_cdx:
+            result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
+
+        if ingest_type == "pdf":
+            if file_meta['mimetype'] != "application/pdf":
+                result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
+                return result
+        elif ingest_type == "xml":
+            if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"):
+                result['status'] = "wrong-mimetype"
+                return result
+        elif ingest_type == "html":
+            if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
+                result['status'] = "wrong-mimetype"
+                return result
+        elif ingest_type == "dataset":
+            # datasets are resolved via platform helpers below; no mimetype
+            # check on the landing page itself
+            pass
+        else:
+            raise NotImplementedError()
+
+        ### END COPYPASTA ###
+
+        # determine platform
+        platform_helper = None
+        for (helper_name, helper) in self.dataset_platform_helpers.items():
+            if helper.match_request(request, resource, html_biblio):
+                platform_helper = helper
+                break
+
+        if not platform_helper:
+            result['status'] = 'no-platform-match'
+            return result
+
+        # 2. Use platform-specific methods to fetch manifest metadata and
+        #    decide on an `ingest_strategy`.
+        dataset_meta = platform_helper.get_item(request, resource, html_biblio)
+        result['platform'] = dataset_meta.platform_name
+        result['platform_id'] = dataset_meta.platform_id
+        result['item_name'] = dataset_meta.archiveorg_item_name
+        if not dataset_meta.manifest:
+            result['status'] = 'no-manifest'
+            return result
+
+        result['manifest'] = dataset_meta.manifest or None
+        result['file_count'] = len(dataset_meta.manifest) or None
+        result['total_size'] = sum([m.size for m in dataset_meta.manifest if m.size]) or None
+
+        ingest_strategy = platform_helper.chose_strategy(dataset_meta)
+        result['ingest_strategy'] = ingest_strategy
+
+        strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy)
+        if not strategy_helper:
+            result['status'] = 'no-strategy-helper'
+            return result
+
+        # 3. Use strategy-specific methods to archive all files in the
+        #    platform manifest, and verify manifest metadata.
+        archive_result = strategy_helper.process(dataset_meta)
+
+        # 4. Summarize status and return structured result metadata.
+        result['status'] = archive_result.status
+        result['manifest'] = archive_result.manifest
+        result['file_count'] = len(archive_result.manifest) or None
+        result['total_size'] = sum([m.size for m in archive_result.manifest if m.size]) or None
+
+        if result['status'] == 'success':
+            result['hit'] = True
+            print("[SUCCESS {:>5}] file_count={} total_size={}".format(
+                ingest_type,
+                result['file_count'],
+                result['total_size'],
+            ), file=sys.stderr)
+        else:
+            print("[FAIL {:>5}] status={} file_count={} total_size={}".format(
+                ingest_type,
+                result['status'],
+                result['file_count'],
+                result['total_size'],
+            ), file=sys.stderr)
+        return result
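
As a usage note, the manifest summary fields (`file_count`, `total_size`) computed at the end of `process()` can be exercised directly against the new pydantic models from `fileset_types.py`. A self-contained sketch, assuming the `sandcrawler` package from this commit is importable; the file entries are made up for illustration:

```python
# Exercises the manifest summary expressions from the end of process()
# against the new pydantic models; file entries are invented examples.
from sandcrawler.fileset_types import DatasetPlatformItem, FilesetManifestFile

item = DatasetPlatformItem(
    platform_name="dataverse",
    platform_status="success",
    manifest=[
        FilesetManifestFile(path="data/readings.csv", size=48213, mimetype="text/csv"),
        FilesetManifestFile(path="README.txt", size=1004, mimetype="text/plain"),
    ],
)

# same expressions as in IngestFilesetWorker.process()
file_count = len(item.manifest) or None
total_size = sum([m.size for m in item.manifest if m.size]) or None
print(file_count, total_size)  # 2 49217
```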