aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/fileset_platforms.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-11 11:02:01 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-15 18:15:29 -0700
commitcc9c63d6b9d07c9d192c32e107254932f4b4a66b (patch)
treeb9c16f18db61e4f245c305dab0ed261ac86202f7 /python/sandcrawler/fileset_platforms.py
parent2e285e469251125ee70bc4c3408dbbcad8701b2c (diff)
downloadsandcrawler-cc9c63d6b9d07c9d192c32e107254932f4b4a66b.tar.gz
sandcrawler-cc9c63d6b9d07c9d192c32e107254932f4b4a66b.zip
fileset ingest: improve platform parsing
Diffstat (limited to 'python/sandcrawler/fileset_platforms.py')
-rw-r--r--python/sandcrawler/fileset_platforms.py208
1 files changed, 196 insertions, 12 deletions
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 15976f5..5f2f743 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -56,10 +56,71 @@ class DataverseHelper(DatasetPlatformHelper):
def __init__(self):
self.platform_name = 'dataverse'
self.session = requests.Session()
- self.dataverse_domain_allowlist = [
- 'dataverse.harvard.edu',
- 'data.lipi.go.id',
- ]
+
+ @staticmethod
+ def parse_dataverse_persistentid(pid: str) -> dict:
+ """
+ Parses a persistentId into 5 sections:
+
+ - type (doi or hdl)
+ - authority (eg, DOI prefix)
+ - shoulder (optional, eg 'DVN')
+ - dataset_id (6-digit)
+ - file_id
+
+ The returned dict always has all components, which may be 'None' if optional.
+
+ This is possible because the dataverse software only supports a handful
+ of configurations and persistend identifier types.
+
+ If there is an error parsing, raises a ValueError
+ """
+ id_type = None
+ if pid.startswith('doi:10.'):
+ id_type = 'doi'
+ pid = pid[4:]
+ elif pid.startswith('hdl:'):
+ id_type = 'hdl'
+ pid = pid[4:]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ comp = pid.split('/')
+ if len(comp) < 2:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ authority = comp[0]
+ shoulder = None
+ dataset_id = None
+ file_id = None
+ if len(comp[1]) != 6 and len(comp) == 3:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ elif len(comp[1]) != 6 and len(comp) == 4:
+ shoulder = comp[1]
+ dataset_id = comp[2]
+ file_id = comp[3]
+ elif len(comp[1]) == 6 and len(comp) == 2:
+ dataset_id = comp[1]
+ elif len(comp[1]) == 6 and len(comp) == 3:
+ dataset_id = comp[1]
+ file_id = comp[2]
+ else:
+ raise ValueError(f"unknown dataverse persistentId format: {pid}")
+
+ if len(dataset_id) != 6:
+ raise ValueError(f"expected a 6-digit dataverse dataset id: {dataset_id}")
+ if file_id and len(file_id) != 6:
+ raise ValueError(f"expected a 6-digit dataverse file id: {file_id}")
+
+ return {
+ "type": id_type,
+ "authority": authority,
+ "shoulder": shoulder,
+ "dataset_id": dataset_id,
+ "file_id": file_id,
+ }
+
def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
if resource and resource.terminal_url:
@@ -67,19 +128,22 @@ class DataverseHelper(DatasetPlatformHelper):
else:
url = request['base_url']
+ # TODO: could also do HTML platform detection or something?
+
components = urllib.parse.urlparse(url)
platform_domain = components.netloc.split(':')[0].lower()
params = urllib.parse.parse_qs(components.query)
platform_id = params.get('persistentId')
-
- if not platform_domain in self.dataverse_domain_allowlist:
- return False
if not platform_id:
return False
+ platform_id = platform_id[0]
- if html_biblio and 'dataverse' in html_biblio.publisher.lower():
- return True
- return False
+ try:
+ parsed = self.parse_dataverse_persistentid(platform_id)
+ except ValueError:
+ return False
+
+ return True
def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
"""
@@ -192,12 +256,99 @@ class DataverseHelper(DatasetPlatformHelper):
extra=dict(version=dataset_version),
)
+def test_parse_dataverse_persistentid():
+
+ valid = {
+ "doi:10.25625/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.25625",
+ "shoulder": None,
+ "dataset_id": "LL6WXZ",
+ "file_id": None,
+ },
+ "doi:10.25625/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.25625",
+ "shoulder": None,
+ "dataset_id": "LL6WXZ",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": None,
+ },
+ "doi:10.5072/FK2/J8SJZB/LL6WXZ": {
+ "type": "doi",
+ "authority": "10.5072",
+ "shoulder": "FK2",
+ "dataset_id": "J8SJZB",
+ "file_id": "LL6WXZ",
+ },
+ "hdl:20.500.12690/RIN/IDDOAH/BTNH25": {
+ "type": "hdl",
+ "authority": "20.500.12690",
+ "shoulder": "RIN",
+ "dataset_id": "IDDOAH",
+ "file_id": "BTNH25",
+ },
+ "doi:10.7910/DVN/6HPRIG": {
+ "type": "doi",
+ "authority": "10.7910",
+ "shoulder": "DVN",
+ "dataset_id": "6HPRIG",
+ "file_id": None,
+ },
+ }
+
+ invalid = [
+ #"doi:10.5072/FK2/J8SJZB/LL6WXZ",
+ "doi:10.25625/abcd",
+ "other:10.25625/LL6WXZ",
+ "10.25625/LL6WXZ",
+ "doi:10.5072/FK2/J8SJZB/LL6WXZv123",
+ ]
+
+ for pid, val in valid.items():
+ assert DataverseHelper.parse_dataverse_persistentid(pid) == val
+
+ for pid in invalid:
+ try:
+ DataverseHelper.parse_dataverse_persistentid(pid)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
class FigshareHelper(DatasetPlatformHelper):
def __init__(self):
self.platform_name = 'figshare'
self.session = requests.Session()
+ @staticmethod
+ def parse_figshare_url_path(path: str) -> List[str]:
+ """
+ Tries to parse a figshare URL into ID number and (optional) version number.
+
+ Returns a two-element list; version number will be None if not found
+
+ Raises a ValueError if not a figshare URL
+ """
+ # eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1
+
+ comp = path.split('/')
+ if len(comp) < 4 or comp[1] != 'articles':
+ raise ValueError
+
+ if len(comp) == 5 and comp[3].isdigit() and comp[4].isdigit():
+ return (comp[3], comp[4])
+ elif len(comp) == 4 and comp[3].isdigit():
+ return (comp[3], None)
+ else:
+ raise ValueError
+
def match_request(self, request: dict , resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> bool:
if resource and resource.terminal_url:
@@ -207,9 +358,20 @@ class FigshareHelper(DatasetPlatformHelper):
components = urllib.parse.urlparse(url)
platform_domain = components.netloc.split(':')[0].lower()
- # only work with full, versioned figshare URLs
- if 'figshare.com' in platform_domain and '/articles/' in components.path and len(components.path.split('/')) >= 6:
+
+ # only work with full, versioned figshare.com URLs
+ if not 'figshare.com' in platform_domain:
+ return False
+
+ try:
+ parsed = self.parse_figshare_url_path(components.path)
+ except ValueError:
+ return False
+
+ # has file component
+ if parsed[0] and parsed[1]:
return True
+
return False
def process_request(self, request: dict, resource: Optional[ResourceResult], html_biblio: Optional[BiblioMetadata]) -> DatasetPlatformItem:
@@ -287,6 +449,28 @@ class FigshareHelper(DatasetPlatformHelper):
extra=dict(version=dataset_version),
)
+def test_parse_figshare_url_path():
+
+ valid = {
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": ("8987858", "1"),
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": ("8987858", None),
+ "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"),
+ }
+
+ invalid = [
+ "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species",
+ ]
+
+ for path, val in valid.items():
+ assert FigshareHelper.parse_figshare_url_path(path) == val
+
+ for path in invalid:
+ try:
+ FigshareHelper.parse_figshare_url_path(path)
+ assert False, "should not get here"
+ except ValueError:
+ pass
+
class ZenodoHelper(DatasetPlatformHelper):
def __init__(self):