From 93adbbb98eff0c148bada84f794783b733c58f73 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 6 Oct 2021 19:12:05 -0700
Subject: improvements to platform helpers

---
 python/sandcrawler/fileset_platforms.py  | 71 ++++++++++++++++++--------------
 python/sandcrawler/fileset_strategies.py |  5 ++-
 python/sandcrawler/ingest_fileset.py     |  2 +-
 3 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 9232870..ac8a3af 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -62,11 +62,12 @@ class DataverseHelper(DatasetPlatformHelper):
     ]
 
     def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
-        """
-        XXX: should match process_request() logic better
-        """
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request['base_url']
 
-        components = urllib.parse.urlparse(request['base_url'])
+        components = urllib.parse.urlparse(url)
         platform_domain = components.netloc.split(':')[0].lower()
         params = urllib.parse.parse_qs(components.query)
         platform_id = params.get('persistentId')
@@ -86,10 +87,15 @@ class DataverseHelper(DatasetPlatformHelper):
 
         HTTP GET https://demo.dataverse.org/api/datasets/export?exporter=dataverse_json&persistentId=doi:10.5072/FK2/J8SJZB
-
         """
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request['base_url']
+
         # 1. extract domain, PID, and version from URL
-        components = urllib.parse.urlparse(request['base_url'])
+        components = urllib.parse.urlparse(url)
         platform_domain = components.netloc.split(':')[0].lower()
         params = urllib.parse.parse_qs(components.query)
         dataset_version = params.get('version')
@@ -256,52 +262,37 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
                 return False
         return True
 
-    def parse_item_file(self, f: dict) -> FilesetManifestFile:
-        """
-        Takes an IA API file and turns it in to a fatcat fileset manifest file
-        """
-        assert f.name and f.sha1 and f.md5
-        assert f.name is not None
-        mf = {
-            'path': f.name,
-            'size': int(f.size),
-            'sha1': f.sha1,
-            'md5': f.md5,
-        }
-        # TODO: will disable this hard check eventually and replace with:
-        #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
-        mimetype = self.FORMAT_TO_MIMETYPE[f.format]
-        if mimetype:
-            mf['extra'] = dict(mimetype=mimetype)
-        return mf
-
     def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
+
+        if resource and resource.terminal_url:
+            url = resource.terminal_url
+        else:
+            url = request['base_url']
         patterns = [
             '://archive.org/details/',
             '://archive.org/download/',
         ]
         for p in patterns:
-            if p in request['base_url']:
+            if p in url:
                 return True
         return False
 
     def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
         """
         Fetch platform-specific metadata for this request (eg, via API calls)
-
-        XXX: add platform_url (for direct download)
         """
         base_url_split = request['base_url'].split('/')
         #print(base_url_split, file=sys.stderr)
-        assert len(base_url_split) == 5
+        assert len(base_url_split) in [5,6]
         assert base_url_split[0] in ['http:', 'https:']
         assert base_url_split[2] == 'archive.org'
         assert base_url_split[3] in ['details', 'download']
         item_name = base_url_split[4]
+        if len(base_url_split) == 6:
+            assert not base_url_split[5]
 
-        print(f"  archiveorg processing item={item_name}", file=sys.stderr)
+        #print(f"  archiveorg processing item={item_name}", file=sys.stderr)
         item = self.session.get_item(item_name)
         item_name = item.identifier
         item_collection = item.metadata['collection']
@@ -309,7 +300,20 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
             item_collection = item_collection[0]
         assert item.metadata['mediatype'] not in ['collection', 'web']
         item_files = item.get_files(on_the_fly=False)
-        manifest = [self.parse_item_file(f) for f in item_files if self.want_item_file(f, item_name)]
+        item_files = [f for f in item_files if self.want_item_file(f, item_name)]
+        manifest = []
+        for f in item_files:
+            assert f.name and f.sha1 and f.md5
+            assert f.name is not None
+            mf = FilesetManifestFile(
+                path=f.name,
+                size=int(f.size),
+                sha1=f.sha1,
+                md5=f.md5,
+                mimetype=self.FORMAT_TO_MIMETYPE[f.format],
+                platform_url=f"https://archive.org/download/{item_name}/{f.name}",
+            )
+            manifest.append(mf)
 
         return DatasetPlatformItem(
             platform_name=self.platform_name,
@@ -322,6 +326,9 @@ class ArchiveOrgHelper(DatasetPlatformHelper):
         )
 
     def chose_strategy(self, item: DatasetPlatformItem) -> IngestStrategy:
+        """
+        Don't use default strategy picker; we are always doing an 'existing' in this case.
+        """
         if len(item.manifest) == 1:
             # NOTE: code flow does not support ArchiveorgFilesetBundle for the
             # case of, eg, a single zipfile in an archive.org item
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 5ee4cc9..6bda9b4 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -82,6 +82,9 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy):
         if existing:
             return existing
 
+        if item.platform_name == 'archiveorg':
+            raise ValueError("shouldn't download archive.org into itself")
+
         local_dir = self.working_dir + item.archiveorg_item_name
         assert local_dir.startswith('/')
         assert local_dir.count('/') > 2
@@ -193,7 +196,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
             self.try_spn2 = True
             self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
 
-        # XXX: this is copypasta
+        # XXX: this is copypasta, and also should be part of SPN client, not here
         self.spn2_simple_get_domains = [
             # direct PDF links
             "://arxiv.org/pdf/",
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 3e782ed..f69fff4 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -263,7 +263,7 @@ class IngestFilesetWorker(IngestFileWorker):
         terminal_url = base_url
         if resource:
             terminal_url = resource.terminal_url
-        dataset_meta = platform_helper.process_request(request, terminal_url, html_biblio)
+        dataset_meta = platform_helper.process_request(request, resource, html_biblio)
         #print(dataset_meta, file=sys.stderr)
         platform = dataset_meta.platform_name
         result['platform'] = dataset_meta.platform_name
-- 
cgit v1.2.3