diff options
Diffstat (limited to 'notes/cleanups/wayback_timestamps.md')
-rw-r--r-- | notes/cleanups/wayback_timestamps.md | 24 |
1 files changed, 24 insertions, 0 deletions
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index 85e5f94f..38bc3f7d 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -265,3 +265,27 @@ Running in bulk again: | gzip \ > files_20211105_moreshortts.fetched.json.gz +Ran into one: `requests.exceptions.HTTPError: 503 Server Error: Service +Temporarily Unavailable for url: [...]`. Will try again; if there are more +failures, may need to split up into smaller chunks. + +Unexpected: + + Traceback (most recent call last): + File "./fetch_full_cdx_ts.py", line 200, in <module> + main() + File "./fetch_full_cdx_ts.py", line 197, in main + print(json.dumps(process_file(fe, session=session))) + File "./fetch_full_cdx_ts.py", line 118, in process_file + assert seg[4].isdigit() + AssertionError + 3.96M 3:04:46 [ 357 /s] + +Ugh. + + zcat files_20211105_moreshortts.json.gz \ + | tac \ + | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211105_moreshortts.fetched.json.gz |