diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-14 16:53:28 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-14 16:53:28 -0700 |
commit | 03d7c929e1b415cbd612d612b9b1c9725f5690bb (patch) | |
tree | 40f65416f650f062243e524ff46a06f7b79d44b4 /python/fatcat/matched_importer.py | |
parent | 32ab9f040b313ce421620a2df71332e24c425cfc (diff) | |
download | fatcat-03d7c929e1b415cbd612d612b9b1c9725f5690bb.tar.gz fatcat-03d7c929e1b415cbd612d612b9b1c9725f5690bb.zip |
switch manifest importer to be json-based
Diffstat (limited to 'python/fatcat/matched_importer.py')
-rw-r--r-- | python/fatcat/matched_importer.py | 13 |
1 file changed, 9 insertions, 4 deletions
diff --git a/python/fatcat/matched_importer.py b/python/fatcat/matched_importer.py index 4b82b6b2..44735d52 100644 --- a/python/fatcat/matched_importer.py +++ b/python/fatcat/matched_importer.py @@ -25,10 +25,10 @@ class FatcatMatchedImporter(FatcatImporter): - dt - url - mimetype + - urls (list of strings... or objects?) Future handlings/extensions: - core_id, wikidata_id, pmcid, pmid: not as lists - - urls (list of strings... or objects?) """ def __init__(self, host_url, skip_file_update=False, default_mime=None, @@ -42,6 +42,11 @@ class FatcatMatchedImporter(FatcatImporter): rel = self.default_link_rel # TODO: this is where we could map specific domains to rel types, # and also filter out bad domains, invalid URLs, etc + if "//archive.org/" in url or "//arxiv.org/" in url: + # TODO: special-case the arxiv.org bulk mirror? + rel = "repository" + elif "//web.archive.org/" in url or "//archive.is/" in url: + rel = "webarchive" return fatcat_client.FileEntityUrls(url=raw, rel=rel) def parse_matched_dict(self, obj): @@ -90,10 +95,10 @@ class FatcatMatchedImporter(FatcatImporter): url = self.make_url(url) if url != None: fe.urls.append(url) - if obj.get('cdx') != None: - original = obj['cdx']['url'] + for cdx in obj.get('cdx', []): + original = cdx['url'] wayback = "https://web.archive.org/web/{}/{}".format( - obj['cdx']['dt'], + cdx['dt'], original) if wayback not in existing_urls: fe.urls.append( |