From 1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 21 Jun 2021 20:03:45 +0200 Subject: update script, notes --- extra/cdx/cdx_query.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'extra/cdx') diff --git a/extra/cdx/cdx_query.py b/extra/cdx/cdx_query.py index 9b7a1f0..b41ae53 100644 --- a/extra/cdx/cdx_query.py +++ b/extra/cdx/cdx_query.py @@ -9,31 +9,29 @@ import requests import sys Cdx = collections.namedtuple("Cdx", "surt date url mime status hash size") +CdxApi = "http://web.archive.org/cdx/search/cdx" +okStatus = ('200', '301', '302', '303') def parse_cdx_lines(blob): result = [] lines = blob.split("\n") - cutoff = ["access", "Access", "abgeruf", "aufgeruf"] for line in lines: - for c in cutoff: - if c in line: - line = line[:line.index(c)] - break fields = line.strip().split() if len(fields) == 0: continue cdx = Cdx(*fields) result.append(cdx) result = sorted(result, key=lambda cdx: cdx.date, reverse=True) - result = list(filter(lambda cdx: cdx.status in ("200", "301", "302", "303"), result)) + result = list(filter(lambda cdx: cdx.status in okStatus, result)) return result + def main(): stats = collections.Counter() for line in fileinput.input(): line = line.strip() - r = requests.get("http://web.archive.org/cdx/search/cdx?url={}".format(line)) + r = requests.get("{}?url={}".format(CdxApi, line)) if not r.ok: continue try: @@ -49,9 +47,9 @@ def main(): stats["ok"] += 1 print("OK\t{}\t{}".format(cdx_lines[0].date, line)) - return stats -if __name__ == '__main__': + +if __name__ == "__main__": stats = main() print(stats) -- cgit v1.2.3