| author    | Bryan Newbold <bnewbold@archive.org>             | 2020-04-15 12:42:42 -0700 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Bryan Newbold <bnewbold@archive.org>             | 2020-04-15 13:39:28 -0700 |
| commit    | ed27bf6fb7afda158812c0822498bc7408572b8e (patch)  |                           |
| tree      | a7a98ac1e9546b0360e4fdf58dfa4f0e4e626745          |                           |
| parent    | d11879e1c75a8fb1882dbb23533a458619185a9a (diff)   |                           |
COVID-19 Chinese paper ingest
 -rw-r--r--  notes/ingest/2020-04-13_covid19.md    | 73
 -rwxr-xr-x  python/scripts/covid2ingestrequest.py | 83
 2 files changed, 156 insertions, 0 deletions
diff --git a/notes/ingest/2020-04-13_covid19.md b/notes/ingest/2020-04-13_covid19.md
new file mode 100644
index 0000000..b442d69
--- /dev/null
+++ b/notes/ingest/2020-04-13_covid19.md
@@ -0,0 +1,73 @@

Want to ensure seedlists from Wanfang and CNKI are captured in wayback.

Wanfang URLs seem normal. Let's just submit them in a single queue via SPNv2.
They are heterogeneous after redirect.

CNKI are trickier. The PDF URLs definitely can't be crawled directly... but the
info URLs probably can, and then crawl on to the PDF from there? At least some
seem to capture OK.

Need a scope and identifiers for the ingest requests. Let's do:

    cnki_covid19 / <ident>
    wanfang_covid19 / <ident>

Source: scrape-covid19

## Commands

    # in sandcrawler pipenv
    cat ~/code/covid19.fatcat.wiki/extra/scrape/cnki_metadata.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json
    cat ~/code/covid19.fatcat.wiki/extra/scrape/wanfang*.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/wanfang_ingest_request.2020-04-14.json

    cat /tmp/wanfang_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4
    cat /tmp/cnki_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 8

## Status

    SELECT ingest_request.ingest_type,
           ingest_file_result.status,
           COUNT(*)
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'scrape-covid19'
    GROUP BY ingest_request.ingest_type, ingest_file_result.status
    ORDER BY COUNT(*) DESC;

2020-04-15:

     ingest_type |               status                | count
    -------------+-------------------------------------+-------
     pdf         | spn2-cdx-lookup-failure             |  1588
     pdf         | success                             |   671
     pdf         | gateway-timeout                     |   507
     pdf         | no-pdf-link                         |   181
     pdf         | wayback-error                       |    30
     pdf         | spn2-error:job-failed               |    20
     pdf         | spn2-error                          |     7
     pdf         | spn2-error:soft-time-limit-exceeded |     3
     pdf         | spn2-error:pending                  |     2
    (9 rows)

## Re-Try

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_request.ingest_request_source = 'scrape-covid19'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.hit = false
            AND ingest_file_result.status != 'no-pdf-link'
            AND ingest_file_result.status != 'link-loop'
    ) TO '/grande/snapshots/reingest_covid19.rows.json';

    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_covid19.rows.json | shuf > reingest_covid19.json

    cat reingest_covid19.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 9
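The re-try step above relies on `./scripts/ingestrequest_row2json.py`, which already exists in the repo and is not part of this commit. As a rough, hypothetical sketch only (not the actual sandcrawler script), that step amounts to reading the `row_to_json(ingest_request.*)` lines from the COPY output and re-emitting just the ingest request fields; which columns are kept is an assumption here:

    #!/usr/bin/env python3
    # Hypothetical sketch of the row_to_json -> ingest request step; the real
    # ./scripts/ingestrequest_row2json.py may handle fields differently.

    import json
    import sys

    KEEP = ('base_url', 'ingest_type', 'link_source', 'link_source_id',
            'ingest_request_source')

    def row2request(row):
        # drop database-only columns, keep only the ingest request fields
        return {k: row[k] for k in KEEP if k in row}

    if __name__ == '__main__':
        # takes the COPY output file as an argument, one JSON row per line
        infile = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
        for line in infile:
            if line.strip():
                print(json.dumps(row2request(json.loads(line)), sort_keys=True))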
+""" + +import sys +import json +import argparse +import urlcanon + + +def canon(s): + parsed = urlcanon.parse_url(s) + return str(urlcanon.whatwg(parsed)) + + +def transform_cnki(obj): + + requests = [] + assert obj['cnki_id'] + + + requests = [] + requests.append({ + 'base_url': canon(obj['info_url']), + 'ingest_type': 'pdf', + 'link_source': 'cnki_covid19', + 'link_source_id': obj['cnki_id'], + 'ingest_request_source': 'scrape-covid19', + }) + if 'read_url' in obj: + requests.append({ + 'base_url': canon(obj['read_url']), + 'ingest_type': 'pdf', # actually HTML + 'link_source': 'cnki_covid19', + 'link_source_id': obj['cnki_id'], + 'ingest_request_source': 'scrape-covid19', + }) + + return requests + +def transform_wanfang(obj): + + assert obj['wanfang_id'] + return [{ + 'base_url': canon(obj['url']), + 'ingest_type': 'pdf', + 'link_source': 'wanfang_covid19', + 'link_source_id': obj['wanfang_id'], + 'ingest_request_source': 'scrape-covid19', + }] + + +def run(args): + for l in args.json_file: + if not l.strip(): + continue + row = json.loads(l) + + if 'wanfang_id' in row: + requests = transform_wanfang(row) or [] + elif 'cnki_id' in row: + requests = transform_cnki(row) or [] + else: + continue + for r in requests: + print("{}".format(json.dumps(r, sort_keys=True))) + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('json_file', + help="COVID-19 metadata file to use", + type=argparse.FileType('r')) + subparsers = parser.add_subparsers() + + args = parser.parse_args() + + run(args) + +if __name__ == '__main__': + main() |