summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-04-22 14:48:09 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-04-22 14:48:09 -0700
commit: 5ed0fdfecc8e458d2595794b887c5d9b3febef43 (patch)
tree: 8dd17c40e7514548ce4e657c467466ac74b186f6 /python/fatcat_tools/importers
parent: 88a51468cfb85b0607a3f5fe28ddafca46e104c2 (diff)
download: fatcat-5ed0fdfecc8e458d2595794b887c5d9b3febef43.tar.gz
          fatcat-5ed0fdfecc8e458d2595794b887c5d9b3febef43.zip
matched importer shouldn't require wayback
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/matched.py 12
1 files changed, 7 insertions, 5 deletions
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index ce2f4d57..7868fb75 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -22,7 +22,7 @@ class MatchedImporter(EntityImporter):
- sha256 (hex)
- size (int)
- cdx (list of objects)
- - dt
+ - dt (optional; if included creates wayback link)
- url
- mimetype
- urls (list of strings... or objects?)
@@ -77,15 +77,17 @@ class MatchedImporter(EntityImporter):
urls.add(url)
for cdx in obj.get('cdx', []):
original = cdx['url']
- wayback = "https://web.archive.org/web/{}/{}".format(
- cdx['dt'],
- original)
- urls.add(("webarchive", wayback))
+ if cdx.get('dt'):
+ wayback = "https://web.archive.org/web/{}/{}".format(
+ cdx['dt'],
+ original)
+ urls.add(("webarchive", wayback))
url = make_rel_url(original, default_link_rel=self.default_link_rel)
if url != None:
urls.add(url)
urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls]
if len(urls) == 0:
+ self.counts['skip-no-urls'] += 1
return None
size = obj.get('size')