From 5ed0fdfecc8e458d2595794b887c5d9b3febef43 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 22 Apr 2019 14:48:09 -0700 Subject: matched importer shouldn't require wayback --- python/fatcat_tools/importers/matched.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index ce2f4d57..7868fb75 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -22,7 +22,7 @@ class MatchedImporter(EntityImporter): - sha256 (hex) - size (int) - cdx (list of objects) - - dt + - dt (optional; if included creates wayback link) - url - mimetype - urls (list of strings... or objects?) @@ -77,15 +77,17 @@ class MatchedImporter(EntityImporter): urls.add(url) for cdx in obj.get('cdx', []): original = cdx['url'] - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) - urls.add(("webarchive", wayback)) + if cdx.get('dt'): + wayback = "https://web.archive.org/web/{}/{}".format( + cdx['dt'], + original) + urls.add(("webarchive", wayback)) url = make_rel_url(original, default_link_rel=self.default_link_rel) if url != None: urls.add(url) urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls] if len(urls) == 0: + self.counts['skip-no-urls'] += 1 return None size = obj.get('size') -- cgit v1.2.3