diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-12 14:44:44 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-14 18:11:12 -0700 |
commit | 5eccb38074104960d88df00805d0ebd7ecf839f9 (patch) | |
tree | a11b298e49e73686a5afc7bc7b34e1d0b0a86a79 /python/fatcat_tools | |
parent | 75baf7d423a2cb119bd485672a00fd664e32537c (diff) | |
download | fatcat-5eccb38074104960d88df00805d0ebd7ecf839f9.tar.gz fatcat-5eccb38074104960d88df00805d0ebd7ecf839f9.zip |
fileset ingest small tweaks
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 57 |
1 files changed, 36 insertions, 21 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 38639297..36d72651 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -624,30 +624,45 @@ class IngestFilesetResultImporter(IngestFileResultImporter): # XXX: create URLs and rel for dataset ingest if not row.get('strategy'): return [] - if row['strategy'].startswith('archiveorg') and row.get('archiveorg_item_name'): - return [ - fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}", - rel="archive", - ) - ] - elif row['strategy'].startswith('web') and row.get('web_base_url'): - return [ - fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", - rel="webarchive", - ) - ] - elif row['strategy'] == 'web-file-bundle' and row.get('web_bundle_url'): - return [ - fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", - rel="webarchive", - ) - ] + urls = [] + if row['strategy'] == 'archiveorg-fileset' and row.get('archiveorg_item_name'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/", + rel="archive", + )) + elif row['strategy'] == 'archiveorg-file-hundle' and row.get('archiveorg_item_name'): + # XXX: what is the filename of bundle? + urls.append(fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/", + rel="archive", + )) + elif row['strategy'].startswith('web') and row.get('platform_base_url'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", + rel="webarchive", + )) + elif row['strategy'] == 'web-file-bundle' and row.get('platform_bundle_url'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", + rel="webarchive", + )) else: + # if no archival URLs, bail out return [] + # add any additional / platform URLs here + if row.get('platform_bundle_url'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=row['platform_bundle_url'], + rel="repository-bundle", + )) + if row.get('platform_base_url'): + urls.append(fatcat_openapi_client.FilesetUrl( + url=row['platform_bundle_url'], + rel="repository", + )) + return urls + def parse_record(self, row): request = row['request'] |