diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-04 16:13:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:25 -0700 |
commit | b3447503c0aa2e326ce1e46c993be28f907ec23b (patch) | |
tree | 4bd3de9016ecc95e38de5c75e6fd69b5ce26f74c /python/sandcrawler/fileset_platforms.py | |
parent | 147319ae00a6b788104209083f65cbaa4329c862 (diff) | |
download | sandcrawler-b3447503c0aa2e326ce1e46c993be28f907ec23b.tar.gz sandcrawler-b3447503c0aa2e326ce1e46c993be28f907ec23b.zip |
progress on dataset ingest
Diffstat (limited to 'python/sandcrawler/fileset_platforms.py')
-rw-r--r-- | python/sandcrawler/fileset_platforms.py | 171 |
1 file changed, 163 insertions, 8 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py index 7aeacf2..5342a4e 100644 --- a/python/sandcrawler/fileset_platforms.py +++ b/python/sandcrawler/fileset_platforms.py @@ -6,34 +6,189 @@ import time from collections import namedtuple from typing import Optional, Tuple, Any, Dict, List +import internetarchive + from sandcrawler.html_metadata import BiblioMetadata from sandcrawler.ia import ResourceResult +from sandcrawler.fileset_types import * -class DatasetPlatformHelper(class): +class DatasetPlatformHelper(): - def __init__(): + def __init__(self): self.platform_name = 'unknown' - def match_request(request: dict , resource: ResourceResult, html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool: """ Does this request look like it matches this platform? """ - raise NotImplemented + raise NotImplementedError() - def get_item(request: dict, resource: ResourceResult, html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem: + def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ - raise NotImplemented + raise NotImplementedError() + + def chose_strategy(self, DatasetPlatformItem) -> IngestStrategy: + raise NotImplementedError() class DataverseHelper(DatasetPlatformHelper): - def __init__(): + def __init__(self): self.platform_name = 'dataverse' + def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool: + return False + + def chose_strategy(self, DatasetPlatformItem) -> IngestStrategy: + raise NotImplementedError() + + class ArchiveOrgHelper(DatasetPlatformHelper): - def __init__(): + FORMAT_TO_MIMETYPE = { + 'BZIP': 'application/x-bzip', + 'BZIP2': 'application/x-bzip2', + 
'ZIP': 'application/zip', + 'GZIP': 'application/gzip', + 'RAR': 'application/vnd.rar', + 'TAR': 'application/x-tar', + '7z': 'application/x-7z-compressed', + + 'HTML': 'text/html', + 'Text': 'text/plain', + 'PDF': 'application/pdf', + + 'CSV': 'text/csv', + 'XML': 'application/xml', + 'JSON': 'application/json', + + #'application/msword (.doc)', # .doc + #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx + #'application/vnd.ms-excel', # .xls + #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx + + 'MP3': 'audio/mpeg', # .mp3 + + 'MP4': 'video/mp4', # .mp4 + 'MPEG': 'video/mpeg', # .mpeg + + 'JPEG': 'image/jpeg', + 'GIF': 'image/gif', + 'PNG': 'image/png', + 'TIFF': 'image/tiff', + + 'Unknown': None, + } + + def __init__(self): self.platform_name = 'archiveorg' + self.session = internetarchive.get_session() + + @staticmethod + def want_item_file(f: dict, item_name: str) -> bool: + """ + Filters IA API files + """ + if f.source != 'original': + return False + for suffix in [ + '_meta.sqlite', + '_archive.torrent', + '_itemimage.jpg', + '_meta.xml', + '_thumb.png', + '_files.xml', + ]: + if f.name == item_name + suffix or f.name == item_name.lower() + suffix: + return False + if f.name.startswith('_'): + return False + if item_name.startswith('academictorrents_'): + for suffix in ['_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib']: + if f.name == item_name + suffix: + return False + return True + + def parse_item_file(self, f: dict) -> FilesetManifestFile: + """ + Takes an IA API file and turns it in to a fatcat fileset manifest file + """ + assert f.name and f.sha1 and f.md5 + assert f.name is not None + mf = { + 'path': f.name, + 'size': int(f.size), + 'sha1': f.sha1, + 'md5': f.md5, + } + # TODO: will disable this hard check eventually and replace with: + #mimetype = FORMAT_TO_MIMETYPE.get(f.format) + mimetype = self.FORMAT_TO_MIMETYPE[f.format] + if mimetype: + mf['extra'] = 
dict(mimetype=mimetype) + return mf + + + def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool: + patterns = [ + '://archive.org/details/', + '://archive.org/download/', + ] + for p in patterns: + if p in request['base_url']: + return True + return False + + def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem: + """ + Fetch platform-specific metadata for this request (eg, via API calls) + + XXX: add platform_url (for direct download) + """ + + base_url_split = request['base_url'].split('/') + #print(base_url_split, file=sys.stderr) + assert len(base_url_split) == 5 + assert base_url_split[0] in ['http:', 'https:'] + assert base_url_split[2] == 'archive.org' + assert base_url_split[3] in ['details', 'download'] + item_name = base_url_split[4] + + print(f" archiveorg processing item={item_name}", file=sys.stderr) + item = self.session.get_item(item_name) + item_name = item.identifier + item_collection = item.metadata['collection'] + if type(item_collection) == list: + item_collection = item_collection[0] + assert item.metadata['mediatype'] not in ['collection', 'web'] + item_files = item.get_files(on_the_fly=False) + manifest = [self.parse_item_file(f) for f in item_files if self.want_item_file(f, item_name)] + + return DatasetPlatformItem( + platform_name=self.platform_name, + platform_status='success', + manifest=manifest, + platform_domain='archive.org', + platform_id=item_name, + archiveorg_item_name=item_name, + archiveorg_collection=item_collection, + ) + + def chose_strategy(self, item: DatasetPlatformItem) -> IngestStrategy: + if len(item.manifest) == 1: + # NOTE: code flow does not support ArchiveorgFilesetBundle for the + # case of, eg, a single zipfile in an archive.org item + return IngestStrategy.ArchiveorgFile + elif len(item.manifest) >= 1: + return IngestStrategy.ArchiveorgFileset + else: + raise 
NotImplementedError() + + +DATASET_PLATFORM_HELPER_TABLE = { + 'dataverse': DataverseHelper(), + 'archiveorg': ArchiveOrgHelper(), +} |