aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat/matched_importer.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2018-09-14 16:53:28 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2018-09-14 16:53:28 -0700
commit 03d7c929e1b415cbd612d612b9b1c9725f5690bb (patch)
tree 40f65416f650f062243e524ff46a06f7b79d44b4 /python/fatcat/matched_importer.py
parent 32ab9f040b313ce421620a2df71332e24c425cfc (diff)
download fatcat-03d7c929e1b415cbd612d612b9b1c9725f5690bb.tar.gz
fatcat-03d7c929e1b415cbd612d612b9b1c9725f5690bb.zip
switch manifest importer to be json-based
Diffstat (limited to 'python/fatcat/matched_importer.py')
-rw-r--r-- python/fatcat/matched_importer.py 13
1 files changed, 9 insertions, 4 deletions
diff --git a/python/fatcat/matched_importer.py b/python/fatcat/matched_importer.py
index 4b82b6b2..44735d52 100644
--- a/python/fatcat/matched_importer.py
+++ b/python/fatcat/matched_importer.py
@@ -25,10 +25,10 @@ class FatcatMatchedImporter(FatcatImporter):
- dt
- url
- mimetype
+ - urls (list of strings... or objects?)
Future handlings/extensions:
- core_id, wikidata_id, pmcid, pmid: not as lists
- - urls (list of strings... or objects?)
"""
def __init__(self, host_url, skip_file_update=False, default_mime=None,
@@ -42,6 +42,11 @@ class FatcatMatchedImporter(FatcatImporter):
rel = self.default_link_rel
# TODO: this is where we could map specific domains to rel types,
# and also filter out bad domains, invalid URLs, etc
+ if "//archive.org/" in url or "//arxiv.org/" in url:
+ # TODO: special-case the arxiv.org bulk mirror?
+ rel = "repository"
+ elif "//web.archive.org/" in url or "//archive.is/" in url:
+ rel = "webarchive"
return fatcat_client.FileEntityUrls(url=raw, rel=rel)
def parse_matched_dict(self, obj):
@@ -90,10 +95,10 @@ class FatcatMatchedImporter(FatcatImporter):
url = self.make_url(url)
if url != None:
fe.urls.append(url)
- if obj.get('cdx') != None:
- original = obj['cdx']['url']
+ for cdx in obj.get('cdx', []):
+ original = cdx['url']
wayback = "https://web.archive.org/web/{}/{}".format(
- obj['cdx']['dt'],
+ cdx['dt'],
original)
if wayback not in existing_urls:
fe.urls.append(