aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_platforms.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-04 13:01:58 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:20 -0700
commit e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e (patch)
tree bee2b4343f30d59fec463dbdcaafafc11d7cd513 /python/sandcrawler/fileset_platforms.py
parent 452475df7619f3743eac5ad86e2e1fb8ba9972da (diff)
download sandcrawler-e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e.tar.gz
sandcrawler-e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e.zip
progress on fileset/dataset ingest
Diffstat (limited to 'python/sandcrawler/fileset_platforms.py')
-rw-r--r-- python/sandcrawler/fileset_platforms.py 39
1 files changed, 39 insertions, 0 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
new file mode 100644
index 0000000..7aeacf2
--- /dev/null
+++ b/python/sandcrawler/fileset_platforms.py
@@ -0,0 +1,39 @@
+
+import sys
+import json
+import gzip
+import time
+from collections import namedtuple
+from typing import Optional, Tuple, Any, Dict, List
+
+from sandcrawler.html_metadata import BiblioMetadata
+from sandcrawler.ia import ResourceResult
+
+
+class DatasetPlatformHelper:
+    """
+    Base class for platform-specific dataset ingest helpers.
+
+    Holds a `platform_name` identifier (default 'unknown'). Both
+    `match_request()` and `get_item()` are placeholders here and always
+    raise NotImplementedError; concrete helpers are expected to override
+    them.
+    """
+
+    def __init__(self):
+        self.platform_name = 'unknown'
+
+    def match_request(self, request: dict, resource: ResourceResult, html_biblio: Optional[BiblioMetadata]) -> bool:
+        """
+        Does this request look like it matches this platform?
+
+        Not implemented in the base class: always raises
+        NotImplementedError.
+        """
+        # NotImplementedError is the exception; bare `NotImplemented` is a
+        # comparison sentinel and raising it fails with a TypeError
+        raise NotImplementedError()
+
+    def get_item(self, request: dict, resource: ResourceResult, html_biblio: Optional[BiblioMetadata]) -> 'DatasetPlatformItem':
+        """
+        Fetch platform-specific metadata for this request (eg, via API calls)
+
+        Not implemented in the base class: always raises
+        NotImplementedError.
+        """
+        # return annotation is a quoted forward reference: no
+        # DatasetPlatformItem is defined or imported in this module yet, so
+        # an unquoted name would raise NameError when the class is defined
+        raise NotImplementedError()
+
+
+class DataverseHelper(DatasetPlatformHelper):
+    """
+    Dataset platform helper identified by platform_name 'dataverse'.
+    """
+
+    def __init__(self):
+        # `self` must be declared explicitly; without it instantiation fails
+        # with a TypeError and the assignment below has nothing to bind to
+        self.platform_name = 'dataverse'
+
+class ArchiveOrgHelper(DatasetPlatformHelper):
+    """
+    Dataset platform helper identified by platform_name 'archiveorg'.
+    """
+
+    def __init__(self):
+        # `self` must be declared explicitly; without it instantiation fails
+        # with a TypeError and the assignment below has nothing to bind to
+        self.platform_name = 'archiveorg'