diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-22 14:48:09 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-22 14:48:09 -0700 | 
| commit | 5ed0fdfecc8e458d2595794b887c5d9b3febef43 (patch) | |
| tree | 8dd17c40e7514548ce4e657c467466ac74b186f6 | |
| parent | 88a51468cfb85b0607a3f5fe28ddafca46e104c2 (diff) | |
| download | fatcat-5ed0fdfecc8e458d2595794b887c5d9b3febef43.tar.gz fatcat-5ed0fdfecc8e458d2595794b887c5d9b3febef43.zip | |
matched importer shouldn't require wayback
| -rw-r--r-- | python/fatcat_tools/importers/matched.py | 12 | 
1 file changed, 7 insertions, 5 deletions
```diff
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index ce2f4d57..7868fb75 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -22,7 +22,7 @@ class MatchedImporter(EntityImporter):
     - sha256 (hex)
     - size (int)
     - cdx (list of objects)
-        - dt
+        - dt (optional; if included creates wayback link)
         - url
     - mimetype
     - urls (list of strings... or objects?)
@@ -77,15 +77,17 @@ class MatchedImporter(EntityImporter):
                 urls.add(url)
         for cdx in obj.get('cdx', []):
             original = cdx['url']
-            wayback = "https://web.archive.org/web/{}/{}".format(
-                cdx['dt'],
-                original)
-            urls.add(("webarchive", wayback))
+            if cdx.get('dt'):
+                wayback = "https://web.archive.org/web/{}/{}".format(
+                    cdx['dt'],
+                    original)
+                urls.add(("webarchive", wayback))
             url = make_rel_url(original, default_link_rel=self.default_link_rel)
             if url != None:
                 urls.add(url)
         urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls]
         if len(urls) == 0:
+            self.counts['skip-no-urls'] += 1
             return None
 
         size = obj.get('size')
```
