# From e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e Mon Sep 17 00:00:00 2001
# From: Bryan Newbold
# Date: Mon, 4 Oct 2021 13:01:58 -0700
# Subject: progress on fileset/dataset ingest
# ---
#  python/sandcrawler/fileset_platforms.py | 39 +++++++++++++++++++++++++++++++++
#  1 file changed, 39 insertions(+)
#  create mode 100644 python/sandcrawler/fileset_platforms.py
#
# (limited to 'python/sandcrawler/fileset_platforms.py')
# diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
# new file mode 100644
# index 0000000..7aeacf2
# -- cgit v1.2.3
#
# NOTE(review): the body below is the patch payload with line breaks restored
# and its syntax/runtime errors fixed, so it no longer matches blob 7aeacf2
# byte-for-byte.

import sys
import json
import gzip
import time
from collections import namedtuple
from typing import Optional, Tuple, Any, Dict, List

from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult


class DatasetPlatformHelper:
    """
    Base class for platform-specific dataset helpers.

    Subclasses set `platform_name` and are expected to override
    `match_request()` and `get_item()`; the implementations here only raise
    `NotImplementedError`.
    """

    def __init__(self):
        # Placeholder name; each subclass overwrites this with its own value.
        self.platform_name = 'unknown'

    def match_request(
        self,
        request: dict,
        resource: ResourceResult,
        html_biblio: Optional[BiblioMetadata],
    ) -> bool:
        """
        Does this request look like it matches this platform?

        Raises NotImplementedError unless overridden by a subclass.
        """
        raise NotImplementedError

    def get_item(
        self,
        request: dict,
        resource: ResourceResult,
        html_biblio: Optional[BiblioMetadata],
    ) -> "DatasetPlatformItem":
        """
        Fetch platform-specific metadata for this request (eg, via API calls)

        Raises NotImplementedError unless overridden by a subclass.
        """
        # NOTE(review): DatasetPlatformItem is not defined or imported in this
        # file, so the return annotation is a quoted forward reference to keep
        # the class body from raising NameError -- TODO define or import it.
        raise NotImplementedError


class DataverseHelper(DatasetPlatformHelper):
    """Helper whose `platform_name` is 'dataverse'."""

    def __init__(self):
        super().__init__()
        self.platform_name = 'dataverse'


class ArchiveOrgHelper(DatasetPlatformHelper):
    """Helper whose `platform_name` is 'archiveorg'."""

    def __init__(self):
        super().__init__()
        self.platform_name = 'archiveorg'