author    Bryan Newbold <bnewbold@archive.org>  2021-10-06 19:12:05 -0700
committer Bryan Newbold <bnewbold@archive.org>  2021-10-15 18:15:29 -0700
commit    93adbbb98eff0c148bada84f794783b733c58f73 (patch)
tree      0d6e121d7b84238734c5e45e90897c061066a7dd /python
parent    bff5da3971aa3ad458da048926da5c35252f1fb9 (diff)
download  sandcrawler-93adbbb98eff0c148bada84f794783b733c58f73.tar.gz
          sandcrawler-93adbbb98eff0c148bada84f794783b733c58f73.zip
platform helpers: prefer the crawl's terminal URL over the base URL, add platform_url to archive.org manifest files, and guard against re-downloading archive.org items
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/fileset_platforms.py  | 71
-rw-r--r--  python/sandcrawler/fileset_strategies.py |  5
-rw-r--r--  python/sandcrawler/ingest_fileset.py     |  2
3 files changed, 44 insertions, 34 deletions
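
The recurring change across these hunks is that each platform helper now prefers the crawl's terminal URL (after redirects) over the request's base URL before parsing it. A minimal standalone sketch of that pattern, assuming a hypothetical pick_url() helper that is not part of this diff:

    from typing import Optional

    def pick_url(request: dict, resource: Optional["ResourceResult"]) -> str:
        # Prefer the URL the crawl actually terminated at (after redirects);
        # fall back to the originally requested base URL.
        if resource and resource.terminal_url:
            return resource.terminal_url
        return request["base_url"]

In the diff below, this logic is repeated inline at the top of each match_request() and process_request() implementation rather than factored out.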
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 9232870..ac8a3af 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -62,11 +62,12 @@ class DataverseHelper(DatasetPlatformHelper):
]
def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
- """
- XXX: should match process_request() logic better
- """
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request['base_url']
- components = urllib.parse.urlparse(request['base_url'])
+ components = urllib.parse.urlparse(url)
platform_domain = components.netloc.split(':')[0].lower()
params = urllib.parse.parse_qs(components.query)
platform_id = params.get('persistentId')
@@ -86,10 +87,15 @@ class DataverseHelper(DatasetPlatformHelper):
HTTP GET https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/J8SJZB
-
"""
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request['base_url']
+
# 1. extract domain, PID, and version from URL
- components = urllib.parse.urlparse(request['base_url'])
+ components = urllib.parse.urlparse(url)
platform_domain = components.netloc.split(':')[0].lower()
params = urllib.parse.parse_qs(components.query)
dataset_version = params.get('version')
@@ -256,52 +262,37 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
return False
return True
- def parse_item_file(self, f: dict) -> FilesetManifestFile:
- """
- Takes an IA API file and turns it in to a fatcat fileset manifest file
- """
- assert f.name and f.sha1 and f.md5
- assert f.name is not None
- mf = {
- 'path': f.name,
- 'size': int(f.size),
- 'sha1': f.sha1,
- 'md5': f.md5,
- }
- # TODO: will disable this hard check eventually and replace with:
- #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
- mimetype = self.FORMAT_TO_MIMETYPE[f.format]
- if mimetype:
- mf['extra'] = dict(mimetype=mimetype)
- return mf
-
-
def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
+
+ if resource and resource.terminal_url:
+ url = resource.terminal_url
+ else:
+ url = request['base_url']
patterns = [
'://archive.org/details/',
'://archive.org/download/',
]
for p in patterns:
- if p in request['base_url']:
+ if p in url:
return True
return False
def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
"""
Fetch platform-specific metadata for this request (eg, via API calls)
-
- XXX: add platform_url (for direct download)
"""
base_url_split = request['base_url'].split('/')
#print(base_url_split, file=sys.stderr)
- assert len(base_url_split) == 5
+ assert len(base_url_split) in [5,6]
assert base_url_split[0] in ['http:', 'https:']
assert base_url_split[2] == 'archive.org'
assert base_url_split[3] in ['details', 'download']
item_name = base_url_split[4]
+ if len(base_url_split) == 6:
+ assert not base_url_split[5]
- print(f" archiveorg processing item={item_name}", file=sys.stderr)
+ #print(f" archiveorg processing item={item_name}", file=sys.stderr)
item = self.session.get_item(item_name)
item_name = item.identifier
item_collection = item.metadata['collection']
@@ -309,7 +300,20 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
item_collection = item_collection[0]
assert item.metadata['mediatype'] not in ['collection', 'web']
item_files = item.get_files(on_the_fly=False)
- manifest = [self.parse_item_file(f) for f in item_files if self.want_item_file(f, item_name)]
+ item_files = [f for f in item_files if self.want_item_file(f, item_name)]
+ manifest = []
+ for f in item_files:
+ assert f.name and f.sha1 and f.md5
+ assert f.name is not None
+ mf = FilesetManifestFile(
+ path=f.name,
+ size=int(f.size),
+ sha1=f.sha1,
+ md5=f.md5,
+ mimetype=self.FORMAT_TO_MIMETYPE[f.format],
+ platform_url=f"https://archive.org/download/{item_name}/{f.name}",
+ )
+ manifest.append(mf)
return DatasetPlatformItem(
platform_name=self.platform_name,
@@ -322,6 +326,9 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
)
def chose_strategy(self, item: DatasetPlatformItem) -> IngestStrategy:
+ """
+ Don't use default strategy picker; we are always doing an 'existing' in this case.
+ """
if len(item.manifest) == 1:
# NOTE: code flow does not support ArchiveorgFilesetBundle for the
# case of, eg, a single zipfile in an archive.org item
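
For context, the new inline manifest loop drives the internetarchive client directly. A hedged sketch of the same flow as a standalone function, with FORMAT_TO_MIMETYPE reduced to a placeholder table and the manifest entries written as plain dicts instead of sandcrawler's FilesetManifestFile:

    import internetarchive

    # Placeholder for sandcrawler's much larger FORMAT_TO_MIMETYPE table (assumption).
    FORMAT_TO_MIMETYPE = {
        "Text PDF": "application/pdf",
        "Comma-Separated Values": "text/csv",
    }

    def archiveorg_manifest(item_name: str) -> list:
        item = internetarchive.get_item(item_name)
        manifest = []
        for f in item.get_files(on_the_fly=False):
            # Skip files without complete fixity metadata.
            if not (f.name and f.sha1 and f.md5):
                continue
            manifest.append({
                "path": f.name,
                "size": int(f.size),
                "sha1": f.sha1,
                "md5": f.md5,
                "mimetype": FORMAT_TO_MIMETYPE.get(f.format),
                "platform_url": f"https://archive.org/download/{item_name}/{f.name}",
            })
        return manifest

Note that the committed code is stricter than this sketch: it asserts on missing fixity fields and indexes self.FORMAT_TO_MIMETYPE directly rather than using .get().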
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 5ee4cc9..6bda9b4 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -82,6 +82,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
if existing:
return existing
+ if item.platform_name == 'archiveorg':
+ raise ValueError("shouldn't download archive.org into itself")
+
local_dir = self.working_dir + item.archiveorg_item_name
assert local_dir.startswith('/')
assert local_dir.count('/') > 2
@@ -193,7 +196,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
self.try_spn2 = True
self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
- # XXX: this is copypasta
+ # XXX: this is copypasta, and also should be part of SPN client, not here
self.spn2_simple_get_domains = [
# direct PDF links
"://arxiv.org/pdf/",
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 3e782ed..f69fff4 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -263,7 +263,7 @@ class IngestFilesetWorker(IngestFileWorker):
terminal_url = base_url
if resource:
terminal_url = resource.terminal_url
- dataset_meta = platform_helper.process_request(request, terminal_url, html_biblio)
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
#print(dataset_meta, file=sys.stderr)
platform = dataset_meta.platform_name
result['platform'] = dataset_meta.platform_name