aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_strategies.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-06 18:02:41 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-15 18:15:29 -0700
commitf98f6226097ac34cf8a57ee09a4feea9171addfe (patch)
tree395922d7aabe0dcbed322b4955697bdd2fd67631 /python/sandcrawler/fileset_strategies.py
parent07e8a199766be77f4e89561d03e9b4e995ab7396 (diff)
downloadsandcrawler-f98f6226097ac34cf8a57ee09a4feea9171addfe.tar.gz
sandcrawler-f98f6226097ac34cf8a57ee09a4feea9171addfe.zip
progress on web ingest strategy
Diffstat (limited to 'python/sandcrawler/fileset_strategies.py')
-rw-r--r--python/sandcrawler/fileset_strategies.py103
1 files changed, 102 insertions, 1 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index c335ea6..5ee4cc9 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -11,7 +11,7 @@ from typing import Optional, Tuple, Any, Dict, List
import internetarchive
from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import ResourceResult
+from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, DatasetPlatformItem, ArchiveStrategyResult
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path
@@ -185,8 +185,109 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy):
super().__init__()
self.ingest_strategy = IngestStrategy.ArchiveorgFileset
+class WebFilesetStrategy(FilesetIngestStrategy):
+
+ def __init__(self, **kwargs):
+ self.ingest_strategy = IngestStrategy.WebFileset
+ self.wayback_client = WaybackClient()
+ self.try_spn2 = True
+ self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
+
+ # XXX: this is copypasta
+ self.spn2_simple_get_domains = [
+ # direct PDF links
+ "://arxiv.org/pdf/",
+ "://europepmc.org/backend/ptpmcrender.fcgi",
+ "://pdfs.semanticscholar.org/",
+ "://res.mdpi.com/",
+
+ # platform sites
+ "://zenodo.org/",
+ "://figshare.org/",
+ "://springernature.figshare.com/",
+
+ # popular simple cloud storage or direct links
+ "://s3-eu-west-1.amazonaws.com/",
+ ]
+
+ def process(self, item: DatasetPlatformItem) -> ArchiveStrategyResult:
+ """
+ For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
+
+ TODO:
+ - full fetch_resource() method which can do SPN requests
+ """
+
+ for m in item.manifest:
+ fetch_url = m.platform_url
+ if not fetch_url:
+ raise NotImplementedError("require 'platform_url' for each file when doing Web fetching")
+
+ via = "wayback"
+ resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
+
+
+ if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')):
+ via = "spn2"
+ force_simple_get = 0
+ for domain in self.spn2_simple_get_domains:
+ if domain in fetch_url:
+ force_simple_get = 1
+ break
+ resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=force_simple_get)
+
+ print("[FETCH {:>6}] {} {}".format(
+ via,
+ (resource and resource.status),
+ (resource and resource.terminal_url) or url),
+ file=sys.stderr)
+
+ m.terminal_url = resource.terminal_url
+ m.terminal_dt = resource.terminal_dt
+ m.status = resource.status
+
+ if resource.status != 'success':
+ continue
+ else:
+ assert resource.terminal_status_code == 200
+
+ file_meta = gen_file_metadata(resource.body)
+ file_meta, html_resource = fix_transfer_encoding(file_meta, resource)
+
+ if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex']) or (m.sha1 and m.sha1 != file_meta['sha1hex']):
+ m.status = 'mismatch'
+ continue
+
+ m.md5 = m.md5 or file_meta['md5hex']
+ m.sha1 = m.sha1 or file_meta['md5hex']
+ m.sha256 = m.sha256 or file_meta['sha256hex']
+ m.mimetype = m.mimetype or file_meta['mimetype']
+
+ overall_status = "success"
+ for m in item.manifest:
+ if m.status != 'success':
+ overall_status = m.status
+ break
+ if not item.manifest:
+ overall_status = 'empty-manifest'
+
+ result = ArchiveStrategyResult(
+ ingest_strategy=self.ingest_strategy,
+ status=overall_status,
+ manifest=item.manifest,
+ )
+ return result
+
+class WebFileStrategy(WebFilesetStrategy):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.ingest_strategy = IngestStrategy.WebFile
+
FILESET_STRATEGY_HELPER_TABLE = {
IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(),
IngestStrategy.ArchiveorgFile: ArchiveorgFileStrategy(),
+ IngestStrategy.WebFileset: WebFilesetStrategy(),
+ IngestStrategy.WebFile: WebFileStrategy(),
}