about | summary | refs | log | tree | commit | diff | stats
path: root/extra/cdx
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-06-21 20:03:45 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-06-21 20:03:45 +0200
commit 1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b (patch)
tree 05dad31420f1142f143209f0cfe36e2e8249f8a0 /extra/cdx
parent e82983633b58ead6ed2ce82e36a17af227d5f5ed (diff)
download refcat-1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b.tar.gz
refcat-1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b.zip
update script, notes
Diffstat (limited to 'extra/cdx')
-rw-r--r-- extra/cdx/cdx_query.py 16
1 files changed, 7 insertions, 9 deletions
diff --git a/extra/cdx/cdx_query.py b/extra/cdx/cdx_query.py
index 9b7a1f0..b41ae53 100644
--- a/extra/cdx/cdx_query.py
+++ b/extra/cdx/cdx_query.py
@@ -9,31 +9,29 @@ import requests
import sys
Cdx = collections.namedtuple("Cdx", "surt date url mime status hash size")
+CdxApi = "http://web.archive.org/cdx/search/cdx"
+okStatus = ('200', '301', '302', '303')
def parse_cdx_lines(blob):
result = []
lines = blob.split("\n")
- cutoff = ["access", "Access", "abgeruf", "aufgeruf"]
for line in lines:
- for c in cutoff:
- if c in line:
- line = line[:line.index(c)]
- break
fields = line.strip().split()
if len(fields) == 0:
continue
cdx = Cdx(*fields)
result.append(cdx)
result = sorted(result, key=lambda cdx: cdx.date, reverse=True)
- result = list(filter(lambda cdx: cdx.status in ("200", "301", "302", "303"), result))
+ result = list(filter(lambda cdx: cdx.status in okStatus, result))
return result
+
def main():
stats = collections.Counter()
for line in fileinput.input():
line = line.strip()
- r = requests.get("http://web.archive.org/cdx/search/cdx?url={}".format(line))
+ r = requests.get("{}?url={}".format(CdxApi, line))
if not r.ok:
continue
try:
@@ -49,9 +47,9 @@ def main():
stats["ok"] += 1
print("OK\t{}\t{}".format(cdx_lines[0].date, line))
-
return stats
-if __name__ == '__main__':
+
+if __name__ == "__main__":
stats = main()
print(stats)