diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-22 14:48:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-22 14:48:09 -0700 |
commit | 5ed0fdfecc8e458d2595794b887c5d9b3febef43 (patch) | |
tree | 8dd17c40e7514548ce4e657c467466ac74b186f6 | |
parent | 88a51468cfb85b0607a3f5fe28ddafca46e104c2 (diff) | |
download | fatcat-5ed0fdfecc8e458d2595794b887c5d9b3febef43.tar.gz fatcat-5ed0fdfecc8e458d2595794b887c5d9b3febef43.zip |
matched importer shouldn't require wayback
-rw-r--r-- | python/fatcat_tools/importers/matched.py | 12 |
1 file changed, 7 insertions, 5 deletions
```diff
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index ce2f4d57..7868fb75 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -22,7 +22,7 @@ class MatchedImporter(EntityImporter):
     - sha256 (hex)
     - size (int)
     - cdx (list of objects)
-        - dt
+        - dt (optional; if included creates wayback link)
         - url
         - mimetype
     - urls (list of strings... or objects?)
@@ -77,15 +77,17 @@ class MatchedImporter(EntityImporter):
                 urls.add(url)
         for cdx in obj.get('cdx', []):
             original = cdx['url']
-            wayback = "https://web.archive.org/web/{}/{}".format(
-                cdx['dt'],
-                original)
-            urls.add(("webarchive", wayback))
+            if cdx.get('dt'):
+                wayback = "https://web.archive.org/web/{}/{}".format(
+                    cdx['dt'],
+                    original)
+                urls.add(("webarchive", wayback))
             url = make_rel_url(original, default_link_rel=self.default_link_rel)
             if url != None:
                 urls.add(url)
         urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls]
         if len(urls) == 0:
+            self.counts['skip-no-urls'] += 1
             return None
 
         size = obj.get('size')
```