author | Bryan Newbold <bnewbold@archive.org> | 2021-10-04 13:01:58 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:20 -0700 |
commit | e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e (patch) | |
tree | bee2b4343f30d59fec463dbdcaafafc11d7cd513 | |
parent | 452475df7619f3743eac5ad86e2e1fb8ba9972da (diff) | |
download | sandcrawler-e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e.tar.gz, sandcrawler-e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e.zip | |
progress on fileset/dataset ingest
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 39
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 22
-rw-r--r-- | python/sandcrawler/fileset_types.py | 43
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 299
4 files changed, 403 insertions, 0 deletions
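
The core of the commit is a dispatch pattern: `IngestFilesetWorker.process()` loops over registered platform helpers, and the first helper whose `match_request()` returns true drives the rest of the ingest. A minimal, self-contained sketch of that loop follows; `FakeDataverseHelper` and the request literal are illustrative stand-ins, not part of this commit:

```python
# Minimal sketch of the platform-helper dispatch used in process().
# FakeDataverseHelper is a stand-in; real helpers also inspect the fetched
# resource and extracted HTML biblio metadata, not just the request.

class FakeDataverseHelper:
    platform_name = "dataverse"

    def match_request(self, request: dict) -> bool:
        # crude URL match, for illustration only
        return "dataverse" in request.get("base_url", "")

helpers = {"dataverse": FakeDataverseHelper()}
request = {
    "ingest_type": "dataset",
    "base_url": "https://dataverse.example.edu/dataset.xhtml?persistentId=doi:10.123/ABC",
}

platform_helper = None
for (helper_name, helper) in helpers.items():
    if helper.match_request(request):
        platform_helper = helper
        break

print(platform_helper.platform_name if platform_helper else "no-platform-match")
```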
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
new file mode 100644
index 0000000..7aeacf2
--- /dev/null
+++ b/python/sandcrawler/fileset_platforms.py
@@ -0,0 +1,39 @@
+
+import sys
+import json
+import gzip
+import time
+from collections import namedtuple
+from typing import Optional, Tuple, Any, Dict, List
+
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.fileset_types import DatasetPlatformItem
+
+
+class DatasetPlatformHelper():
+
+    def __init__(self):
+        self.platform_name = 'unknown'
+
+    def match_request(self, request: dict, resource: ResourceResult, html_biblio: Optional[BiblioMetadata]) -> bool:
+        """
+        Does this request look like it matches this platform?
+        """
+        raise NotImplementedError()
+
+    def get_item(self, request: dict, resource: ResourceResult, html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
+        """
+        Fetch platform-specific metadata for this request (eg, via API calls)
+        """
+        raise NotImplementedError()
+
+
+class DataverseHelper(DatasetPlatformHelper):
+
+    def __init__(self):
+        self.platform_name = 'dataverse'
+
+
+class ArchiveOrgHelper(DatasetPlatformHelper):
+
+    def __init__(self):
+        self.platform_name = 'archiveorg'
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
new file mode 100644
index 0000000..592b475
--- /dev/null
+++ b/python/sandcrawler/fileset_strategies.py
@@ -0,0 +1,22 @@
+
+import sys
+import json
+import gzip
+import time
+from collections import namedtuple
+from typing import Optional, Tuple, Any, Dict, List
+
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem
+
+
+class FilesetIngestStrategy():
+
+    def __init__(self):
+        self.ingest_strategy = 'unknown'
+
+    def check_existing(self):  # XXX: -> Any:
+        raise NotImplementedError()
+
+    def process(self, item: DatasetPlatformItem):
+        raise NotImplementedError()
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
new file mode 100644
index 0000000..f0f03db
--- /dev/null
+++ b/python/sandcrawler/fileset_types.py
@@ -0,0 +1,43 @@
+
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel
+
+class IngestStrategy(str, Enum):
+    WebFile = "web-file"
+    WebFileset = "web-fileset"
+    WebFilesetBundled = "web-fileset-bundled"
+    ArchiveorgFile = "archiveorg-file"
+    ArchiveorgFileset = "archiveorg-fileset"
+    ArchiveorgFilesetBundled = "archiveorg-fileset-bundled"
+
+class FilesetManifestFile(BaseModel):
+    path: str
+    size: Optional[int]
+    md5: Optional[str]
+    sha1: Optional[str]
+    sha256: Optional[str]
+    mimetype: Optional[str]
+
+    status: Optional[str]
+    platform_url: Optional[str]
+    terminal_url: Optional[str]
+    terminal_dt: Optional[str]
+    extra: Optional[Dict[str, Any]]
+
+class DatasetPlatformItem(BaseModel):
+    platform_name: str
+    platform_status: str
+    manifest: Optional[List[FilesetManifestFile]]
+
+    platform_domain: Optional[str]
+    platform_id: Optional[str]
+    archiveorg_item_name: Optional[str]
+    archiveorg_collection: Optional[str]
+    web_base_url: Optional[str]
+    web_bundle_url: Optional[str]
+
+class ArchiveStrategyResult(BaseModel):
+    ingest_strategy: str
+    status: str
+    manifest: List[FilesetManifestFile]
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
new file mode 100644
index 0000000..9ffaa47
--- /dev/null
+++ b/python/sandcrawler/ingest_fileset.py
@@ -0,0 +1,299 @@
+
+import sys
+import json
+import gzip
+import time
+from collections import namedtuple
+from typing import Optional, Tuple, Any, Dict, List
+
+import requests
+from selectolax.parser import HTMLParser
+
+from sandcrawler.ia import (SavePageNowClient, CdxApiClient, WaybackClient,
+    WaybackError, WaybackContentError, SavePageNowError, CdxApiError,
+    PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding,
+    NoCaptureError)
+from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_ingest import fetch_html_resources, \
+    quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
+    WebResource, html_guess_platform
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+# NOTE: this import was missing from the original diff; IngestFileWorker is
+# assumed to live in sandcrawler.ingest at this point in the repo's history
+from sandcrawler.ingest import IngestFileWorker
+from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.db import SandcrawlerPostgrestClient
+
+
+MAX_BODY_SIZE_BYTES = 128*1024*1024
+
+class IngestFilesetWorker(IngestFileWorker):
+    """
+    The general process is:
+
+    1. crawl base_url, and use the request and landing page resource (eg,
+       HTML) to determine the platform being targeted
+    2. use a platform-specific helper to fetch metadata about the work,
+       including a manifest of files, and select an "ingest strategy" and
+       any required context
+    3. use a strategy-specific helper to archive files from the manifest
+       (first checking whether the content has been archived already)
+    4. summarize the status
+    """
+
+    def __init__(self, sink=None, **kwargs):
+        super().__init__(sink=None, **kwargs)
+
+        self.sink = sink
+        # these mappings get populated with platform/strategy helper
+        # instances; wiring them up is still in progress in this commit
+        self.dataset_platform_helpers: Dict[str, Any] = dict()
+        self.dataset_strategy_archivers: Dict[str, Any] = dict()
+
+    def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
+        """
+        Same as the file version, but uses the fileset result table
+        """
+        if not self.try_existing_ingest:
+            return None
+        existing = self.pgrest_client.get_ingest_fileset_result(ingest_type, base_url)
+        # TODO: filter on more flags?
+        if existing and existing['hit'] is True:
+            return existing
+        else:
+            return None
+
+    def process_existing(self, request: dict, result_row: dict) -> dict:
+        """
+        If we have an existing ingest fileset result, do any database fetches
+        or additional processing necessary to return a result.
+        """
+        raise NotImplementedError("process_existing() not tested or safe yet")
+
+    # XXX: use file version
+    #def process_hit(self, ingest_type: str, resource: ResourceResult, file_meta: dict) -> dict:
+
+    def want(self, request: dict) -> bool:
+        if request.get('ingest_type') not in ('dataset',):
+            return False
+        return True
+
+    def process(self, request: dict, key: Any = None) -> dict:
+
+        ingest_type = request.get('ingest_type')
+        if ingest_type not in ("dataset",):
+            raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
+
+        # parse/clean URL
+        # note that we pass through the original/raw URL, and that is what
+        # gets persisted in the database table
+        base_url = clean_url(request['base_url'])
+
+        force_recrawl = bool(request.get('force_recrawl', False))
+
+        for block in self.base_url_blocklist:
+            if block in base_url:
+                print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+                return dict(request=request, hit=False, status="skip-url-blocklist")
+
+        print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr)
+
+        # TODO
+        #existing = self.check_existing_ingest(ingest_type, base_url)
+        #if existing:
+        #    return self.process_existing(request, existing)
+
+        # track redirect hops in the result dict so later code can append
+        hops = [base_url]
+        result: Dict[str, Any] = dict(request=request, hit=False, hops=hops)
+
+        # 1. Determine `platform`, which may involve resolving redirects and
+        #    crawling a landing page.
+
+        ### START COPYPASTA from process_file(), should refactor ###
+
+        # these two were undefined in the copied code; set them explicitly
+        next_url = hops[-1]
+        best_mimetype = None
+
+        # check against blocklist
+        for block in self.base_url_blocklist:
+            if block in next_url:
+                result['status'] = "skip-url-blocklist"
+                return result
+
+        try:
+            resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
+        except SavePageNowError as e:
+            result['status'] = 'spn2-error'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except PetaboxError as e:
+            result['status'] = 'petabox-error'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except CdxApiError as e:
+            result['status'] = 'cdx-error'
+            result['error_message'] = str(e)[:1600]
+            # add a sleep in the cdx-error path as a slow-down
+            time.sleep(2.0)
+            return result
+        except WaybackError as e:
+            result['status'] = 'wayback-error'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except WaybackContentError as e:
+            result['status'] = 'wayback-content-error'
+            result['error_message'] = str(e)[:1600]
+            return result
+        except NotImplementedError as e:
+            result['status'] = 'not-implemented'
+            result['error_message'] = str(e)[:1600]
+            return result
+
+        assert resource
+
+        if resource.terminal_url:
+            result['terminal'] = {
+                "terminal_url": resource.terminal_url,
+                "terminal_dt": resource.terminal_dt,
+                "terminal_status_code": resource.terminal_status_code,
+            }
+            if resource.terminal_url not in result['hops']:
+                result['hops'].append(resource.terminal_url)
+
+        if not resource.hit:
+            result['status'] = resource.status
+            return result
+
+        if resource.terminal_url:
+            for pattern in self.base_url_blocklist:
+                if pattern in resource.terminal_url:
+                    result['status'] = 'skip-url-blocklist'
+                    return result
+
+        if resource.terminal_url:
+            for pattern in self.cookie_blocklist:
+                if pattern in resource.terminal_url:
+                    result['status'] = 'blocked-cookie'
+                    return result
+
+        if not resource.body:
+            result['status'] = 'null-body'
+            return result
+
+        if len(resource.body) > MAX_BODY_SIZE_BYTES:
+            result['status'] = 'body-too-large'
+            return result
+
+        file_meta = gen_file_metadata(resource.body)
+        try:
+            file_meta, resource = fix_transfer_encoding(file_meta, resource)
+        except Exception as e:
+            result['status'] = 'bad-gzip-encoding'
+            result['error_message'] = str(e)
+            return result
+
+        if not resource.body or file_meta['size_bytes'] == 0:
+            result['status'] = 'null-body'
+            return result
+
+        # here we split based on ingest type to try and extract a next hop
+        html_ish_resource = bool(
+            "html" in file_meta['mimetype']
+            or "xhtml" in file_meta['mimetype']  # matches "application/xhtml+xml"
+            or "application/xml" in file_meta['mimetype']
+            or "text/xml" in file_meta['mimetype']
+        )
+        html_biblio = None
+        html_doc = None
+        if html_ish_resource and resource.body:
+            try:
+                html_doc = HTMLParser(resource.body)
+                html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+                if html_biblio:
+                    if 'html_biblio' not in result or html_biblio.title:
+                        result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
+                        #print(f"  setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+            except ValueError:
+                pass
+
+        # the fetch must be a hit if we got this far (though not necessarily
+        # an ingest hit!)
+        assert resource
+        assert resource.hit is True
+        assert resource.terminal_status_code in (200, 226)
+
+        if resource.terminal_url:
+            result['terminal'] = {
+                "terminal_url": resource.terminal_url,
+                "terminal_dt": resource.terminal_dt,
+                "terminal_status_code": resource.terminal_status_code,
+                "terminal_sha1hex": file_meta['sha1hex'],
+            }
+
+        result['file_meta'] = file_meta
+        result['cdx'] = cdx_to_dict(resource.cdx)
+        if resource.revisit_cdx:
+            result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
+
+        if ingest_type == "pdf":
+            if file_meta['mimetype'] != "application/pdf":
+                result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
+                return result
+        elif ingest_type == "xml":
+            if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"):
+                result['status'] = "wrong-mimetype"
+                return result
+        elif ingest_type == "html":
+            if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
+                result['status'] = "wrong-mimetype"
+                return result
+        elif ingest_type == "dataset":
+            # datasets are resolved via platform helpers below; no mimetype
+            # check on the landing page itself
+            pass
+        else:
+            raise NotImplementedError()
+
+        ### END COPYPASTA ###
+
+        # determine platform
+        platform_helper = None
+        for (helper_name, helper) in self.dataset_platform_helpers.items():
+            if helper.match_request(request, resource, html_biblio):
+                platform_helper = helper
+                break
+
+        if not platform_helper:
+            result['status'] = 'no-platform-match'
+            return result
+
+        # 2. Use platform-specific methods to fetch manifest metadata and
+        #    decide on an `ingest_strategy`.
+        dataset_meta = platform_helper.get_item(request, resource, html_biblio)
+        result['platform'] = dataset_meta.platform_name
+        result['platform_id'] = dataset_meta.platform_id
+        result['item_name'] = dataset_meta.archiveorg_item_name
+        if not dataset_meta.manifest:
+            result['status'] = 'no-manifest'
+            return result
+
+        result['manifest'] = dataset_meta.manifest or None
+        result['file_count'] = len(dataset_meta.manifest) or None
+        result['total_size'] = sum([m.size for m in dataset_meta.manifest if m.size]) or None
+
+        ingest_strategy = platform_helper.chose_strategy(dataset_meta)
+        result['ingest_strategy'] = ingest_strategy
+
+        strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy)
+        if not strategy_helper:
+            result['status'] = 'no-strategy-helper'
+            return result
+
+        # 3. Use strategy-specific methods to archive all files in the
+        #    platform manifest, and verify manifest metadata.
+        archive_result = strategy_helper.process(dataset_meta)
+
+        # 4. Summarize status and return structured result metadata.
+        result['status'] = archive_result.status
+        result['manifest'] = archive_result.manifest
+        result['file_count'] = len(archive_result.manifest) or None
+        result['total_size'] = sum([m.size for m in archive_result.manifest if m.size]) or None
+
+        if result['status'] == 'success':
+            result['hit'] = True
+            print("[SUCCESS {:>5}] file_count={} total_size={}".format(
+                ingest_type,
+                result['file_count'],
+                result['total_size'],
+            ), file=sys.stderr)
+        else:
+            print("[FAIL {:>5}] status={} file_count={} total_size={}".format(
+                ingest_type,
+                result['status'],
+                result['file_count'],
+                result['total_size'],
+            ), file=sys.stderr)
+        return result
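
As a usage note, the manifest summary fields (`file_count`, `total_size`) computed at the end of `process()` can be exercised directly against the new pydantic models from `fileset_types.py`. A self-contained sketch, assuming the `sandcrawler` package from this commit is importable; the file entries are made up for illustration:

```python
# Exercises the manifest summary expressions from the end of process()
# against the new pydantic models; file entries are invented examples.
from sandcrawler.fileset_types import DatasetPlatformItem, FilesetManifestFile

item = DatasetPlatformItem(
    platform_name="dataverse",
    platform_status="success",
    manifest=[
        FilesetManifestFile(path="data/readings.csv", size=48213, mimetype="text/csv"),
        FilesetManifestFile(path="README.txt", size=1004, mimetype="text/plain"),
    ],
)

# same expressions as in IngestFilesetWorker.process()
file_count = len(item.manifest) or None
total_size = sum([m.size for m in item.manifest if m.size]) or None
print(file_count, total_size)  # 2 49217
```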