| author | Bryan Newbold <bnewbold@archive.org> | 2021-10-06 19:12:05 -0700 |
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:29 -0700 |
| commit | 93adbbb98eff0c148bada84f794783b733c58f73 (patch) |
| tree | 0d6e121d7b84238734c5e45e90897c061066a7dd /python |
| parent | bff5da3971aa3ad458da048926da5c35252f1fb9 (diff) |
improvements to platform helpers
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/fileset_platforms.py | 71 |
| -rw-r--r-- | python/sandcrawler/fileset_strategies.py | 5 |
| -rw-r--r-- | python/sandcrawler/ingest_fileset.py | 2 |
3 files changed, 44 insertions, 34 deletions
```diff
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 9232870..ac8a3af 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -62,11 +62,12 @@ class DataverseHelper(DatasetPlatformHelper):
     ]
 
     def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
-        """
-        XXX: should match process_request() logic better
-        """
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request['base_url']
 
-        components = urllib.parse.urlparse(request['base_url'])
+        components = urllib.parse.urlparse(url)
         platform_domain = components.netloc.split(':')[0].lower()
         params = urllib.parse.parse_qs(components.query)
         platform_id = params.get('persistentId')
@@ -86,10 +87,15 @@ class DataverseHelper(DatasetPlatformHelper):
 
         HTTP GET https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/J8SJZB
-
         """
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request['base_url']
+
         # 1. extract domain, PID, and version from URL
-        components = urllib.parse.urlparse(request['base_url'])
+        components = urllib.parse.urlparse(url)
         platform_domain = components.netloc.split(':')[0].lower()
         params = urllib.parse.parse_qs(components.query)
         dataset_version = params.get('version')
@@ -256,52 +262,37 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
                 return False
         return True
 
-    def parse_item_file(self, f: dict) -> FilesetManifestFile:
-        """
-        Takes an IA API file and turns it in to a fatcat fileset manifest file
-        """
-        assert f.name and f.sha1 and f.md5
-        assert f.name is not None
-        mf = {
-            'path': f.name,
-            'size': int(f.size),
-            'sha1': f.sha1,
-            'md5': f.md5,
-        }
-        # TODO: will disable this hard check eventually and replace with:
-        #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
-        mimetype = self.FORMAT_TO_MIMETYPE[f.format]
-        if mimetype:
-            mf['extra'] = dict(mimetype=mimetype)
-        return mf
-
-
     def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request['base_url']
         patterns = [
             '://archive.org/details/',
             '://archive.org/download/',
         ]
         for p in patterns:
-            if p in request['base_url']:
+            if p in url:
                 return True
         return False
 
     def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
         """
         Fetch platform-specific metadata for this request (eg, via API calls)
-
-        XXX: add platform_url (for direct download)
         """
         base_url_split = request['base_url'].split('/')
         #print(base_url_split, file=sys.stderr)
-        assert len(base_url_split) == 5
+        assert len(base_url_split) in [5,6]
         assert base_url_split[0] in ['http:', 'https:']
         assert base_url_split[2] == 'archive.org'
         assert base_url_split[3] in ['details', 'download']
         item_name = base_url_split[4]
+        if len(base_url_split) == 6:
+            assert not base_url_split[5]
 
-        print(f" archiveorg processing item={item_name}", file=sys.stderr)
+        #print(f" archiveorg processing item={item_name}", file=sys.stderr)
         item = self.session.get_item(item_name)
         item_name = item.identifier
         item_collection = item.metadata['collection']
@@ -309,7 +300,20 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
             item_collection = item_collection[0]
         assert item.metadata['mediatype'] not in ['collection', 'web']
         item_files = item.get_files(on_the_fly=False)
-        manifest = [self.parse_item_file(f) for f in item_files if self.want_item_file(f, item_name)]
+        item_files = [f for f in item_files if self.want_item_file(f, item_name)]
+        manifest = []
+        for f in item_files:
+            assert f.name and f.sha1 and f.md5
+            assert f.name is not None
+            mf = FilesetManifestFile(
+                path=f.name,
+                size=int(f.size),
+                sha1=f.sha1,
+                md5=f.md5,
+                mimetype=self.FORMAT_TO_MIMETYPE[f.format],
+                platform_url=f"https://archive.org/download/{item_name}/{f.name}",
+            )
+            manifest.append(mf)
 
         return DatasetPlatformItem(
             platform_name=self.platform_name,
@@ -322,6 +326,9 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
         )
 
     def chose_strategy(self, item: DatasetPlatformItem) -> IngestStrategy:
+        """
+        Don't use default strategy picker; we are always doing an 'existing' in this case.
+        """
         if len(item.manifest) == 1:
             # NOTE: code flow does not support ArchiveorgFilesetBundle for the
             # case of, eg, a single zipfile in an archive.org item
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 5ee4cc9..6bda9b4 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -82,6 +82,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         if existing:
             return existing
 
+        if item.platform_name == 'archiveorg':
+            raise ValueError("should't download archive.org into itself")
+
         local_dir = self.working_dir + item.archiveorg_item_name
         assert local_dir.startswith('/')
         assert local_dir.count('/') > 2
@@ -193,7 +196,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
         self.try_spn2 = True
         self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
 
-        # XXX: this is copypasta
+        # XXX: this is copypasta, and also should be part of SPN client, not here
         self.spn2_simple_get_domains = [
             # direct PDF links
             "://arxiv.org/pdf/",
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 3e782ed..f69fff4 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -263,7 +263,7 @@ class IngestFilesetWorker(IngestFileWorker):
         terminal_url = base_url
         if resource:
             terminal_url = resource.terminal_url
-        dataset_meta = platform_helper.process_request(request, terminal_url, html_biblio)
+        dataset_meta = platform_helper.process_request(request, resource, html_biblio)
         #print(dataset_meta, file=sys.stderr)
         platform = dataset_meta.platform_name
         result['platform'] = dataset_meta.platform_name
```
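A note on the recurring pattern in the hunks above: the platform helpers' `match_request`/`process_request` methods now resolve URLs the same way, preferring the terminal (post-redirect) URL of a fetched resource and falling back to the request's `base_url`. A minimal sketch of that pattern in isolation, with a hypothetical stub standing in for sandcrawler's real `ResourceResult`:

```python
from dataclasses import dataclass
from typing import Optional

# Hypothetical stub for sandcrawler's ResourceResult; only the one field
# these helpers read is modeled here.
@dataclass
class ResourceResult:
    terminal_url: Optional[str] = None

def pick_url(request: dict, resource: Optional[ResourceResult]) -> str:
    # Prefer the post-redirect terminal URL when a crawl result exists,
    # falling back to the URL the ingest request started from.
    if resource and resource.terminal_url:
        return resource.terminal_url
    return request['base_url']

request = {'base_url': 'https://doi.org/10.5072/FK2/J8SJZB'}
resource = ResourceResult(
    terminal_url='https://demo.dataverse.org/dataset.xhtml?persistentId=doi:10.5072/FK2/J8SJZB')
assert pick_url(request, resource).startswith('https://demo.dataverse.org/')
assert pick_url(request, None) == request['base_url']
```

This is also why `ingest_fileset.py` now passes the whole `resource` to `process_request` instead of just `terminal_url`: the helper decides the fallback itself.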
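The loosened assertion in `ArchiveOrgHelper.process_request` (`len(base_url_split) in [5,6]`) accounts for a trailing slash on an archive.org URL, which produces a sixth, empty component when splitting on `/`. A quick sketch of the invariant; the helper name here is illustrative, not from the codebase:

```python
def archiveorg_item_name(base_url: str) -> str:
    # Mirrors the assertions in ArchiveOrgHelper.process_request after
    # this commit: five components for a bare details/download URL,
    # six when there is a trailing slash.
    parts = base_url.split('/')
    assert len(parts) in [5, 6]
    assert parts[0] in ['http:', 'https:']
    assert parts[2] == 'archive.org'
    assert parts[3] in ['details', 'download']
    if len(parts) == 6:
        # a trailing slash leaves an empty sixth component
        assert not parts[5]
    return parts[4]

assert archiveorg_item_name('https://archive.org/details/some-item') == 'some-item'
assert archiveorg_item_name('https://archive.org/details/some-item/') == 'some-item'
```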
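Finally, the new `ValueError` in `ArchiveorgFilesetStrategy` pairs with the docstring added to `chose_strategy`: content already hosted on archive.org should take an 'existing' strategy rather than being downloaded and re-uploaded into archive.org. A sketch of the guard, with a hypothetical stub for `DatasetPlatformItem`:

```python
from dataclasses import dataclass, field

# Hypothetical stub; the real DatasetPlatformItem carries more fields
# (platform_domain, platform_id, archiveorg_item_name, ...).
@dataclass
class DatasetPlatformItem:
    platform_name: str
    manifest: list = field(default_factory=list)

def check_not_archiveorg(item: DatasetPlatformItem) -> None:
    # Mirrors the new guard at the top of ArchiveorgFilesetStrategy.process():
    # mirroring archive.org content back into archive.org would be circular.
    if item.platform_name == 'archiveorg':
        raise ValueError("should't download archive.org into itself")  # sic, message as committed

try:
    check_not_archiveorg(DatasetPlatformItem(platform_name='archiveorg'))
except ValueError as err:
    print(err)
```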