| author    | Bryan Newbold <bnewbold@archive.org>             | 2020-04-15 12:42:42 -0700 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Bryan Newbold <bnewbold@archive.org>             | 2020-04-15 13:39:28 -0700 |
| commit    | ed27bf6fb7afda158812c0822498bc7408572b8e (patch)  |                           |
| tree      | a7a98ac1e9546b0360e4fdf58dfa4f0e4e626745          |                           |
| parent    | d11879e1c75a8fb1882dbb23533a458619185a9a (diff)   |                           |
COVID-19 Chinese paper ingest
 -rw-r--r--  notes/ingest/2020-04-13_covid19.md    | 73
 -rwxr-xr-x  python/scripts/covid2ingestrequest.py | 83
 2 files changed, 156 insertions, 0 deletions
diff --git a/notes/ingest/2020-04-13_covid19.md b/notes/ingest/2020-04-13_covid19.md
new file mode 100644
index 0000000..b442d69
--- /dev/null
+++ b/notes/ingest/2020-04-13_covid19.md
@@ -0,0 +1,73 @@

Want to ensure seedlists from Wanfang and CNKI are captured in wayback.

Wanfang URLs seem normal. Let's just submit them in a single queue via SPNv2.
They are heterogeneous after redirect.

CNKI are trickier. The PDF URLs definitely can't be crawled directly... but the
info URLs probably can, and then crawl on to the PDF from there? At least some
seem to capture OK.

Need a scope and identifiers for the ingest requests. Let's do:

    cnki_covid19 / <ident>
    wanfang_covid19 / <ident>

Source: scrape-covid19

## Commands

    # in sandcrawler pipenv
    cat ~/code/covid19.fatcat.wiki/extra/scrape/cnki_metadata.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json
    cat ~/code/covid19.fatcat.wiki/extra/scrape/wanfang*.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/wanfang_ingest_request.2020-04-14.json

    cat /tmp/wanfang_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4
    cat /tmp/cnki_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 8

## Status

    SELECT ingest_request.ingest_type,
           ingest_file_result.status,
           COUNT(*)
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'scrape-covid19'
    GROUP BY ingest_request.ingest_type, ingest_file_result.status
    ORDER BY COUNT(*) DESC;

2020-04-15:

     ingest_type |               status                | count
    -------------+-------------------------------------+-------
     pdf         | spn2-cdx-lookup-failure             |  1588
     pdf         | success                             |   671
     pdf         | gateway-timeout                     |   507
     pdf         | no-pdf-link                         |   181
     pdf         | wayback-error                       |    30
     pdf         | spn2-error:job-failed               |    20
     pdf         | spn2-error                          |     7
     pdf         | spn2-error:soft-time-limit-exceeded |     3
     pdf         | spn2-error:pending                  |     2
    (9 rows)

## Re-Try

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_request.ingest_request_source = 'scrape-covid19'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.hit = false
            AND ingest_file_result.status != 'no-pdf-link'
            AND ingest_file_result.status != 'link-loop'
    ) TO '/grande/snapshots/reingest_covid19.rows.json';

    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_covid19.rows.json | shuf > reingest_covid19.json

    cat reingest_covid19.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 9
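The re-try step above relies on `./scripts/ingestrequest_row2json.py`, which already exists in the repo and is not part of this commit. As a rough, hypothetical sketch only (not the actual sandcrawler script), that step amounts to reading the `row_to_json(ingest_request.*)` lines from the COPY output and re-emitting just the ingest request fields; which columns are kept is an assumption here:

    #!/usr/bin/env python3
    # Hypothetical sketch of the row_to_json -> ingest request step; the real
    # ./scripts/ingestrequest_row2json.py may handle fields differently.

    import json
    import sys

    KEEP = ('base_url', 'ingest_type', 'link_source', 'link_source_id',
            'ingest_request_source')

    def row2request(row):
        # drop database-only columns, keep only the ingest request fields
        return {k: row[k] for k in KEEP if k in row}

    if __name__ == '__main__':
        # takes the COPY output file as an argument, one JSON row per line
        infile = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
        for line in infile:
            if line.strip():
                print(json.dumps(row2request(json.loads(line)), sort_keys=True))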
+""" + +import sys +import json +import argparse +import urlcanon + + +def canon(s): + parsed = urlcanon.parse_url(s) + return str(urlcanon.whatwg(parsed)) + + +def transform_cnki(obj): + + requests = [] + assert obj['cnki_id'] + + + requests = [] + requests.append({ + 'base_url': canon(obj['info_url']), + 'ingest_type': 'pdf', + 'link_source': 'cnki_covid19', + 'link_source_id': obj['cnki_id'], + 'ingest_request_source': 'scrape-covid19', + }) + if 'read_url' in obj: + requests.append({ + 'base_url': canon(obj['read_url']), + 'ingest_type': 'pdf', # actually HTML + 'link_source': 'cnki_covid19', + 'link_source_id': obj['cnki_id'], + 'ingest_request_source': 'scrape-covid19', + }) + + return requests + +def transform_wanfang(obj): + + assert obj['wanfang_id'] + return [{ + 'base_url': canon(obj['url']), + 'ingest_type': 'pdf', + 'link_source': 'wanfang_covid19', + 'link_source_id': obj['wanfang_id'], + 'ingest_request_source': 'scrape-covid19', + }] + + +def run(args): + for l in args.json_file: + if not l.strip(): + continue + row = json.loads(l) + + if 'wanfang_id' in row: + requests = transform_wanfang(row) or [] + elif 'cnki_id' in row: + requests = transform_cnki(row) or [] + else: + continue + for r in requests: + print("{}".format(json.dumps(r, sort_keys=True))) + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('json_file', + help="COVID-19 metadata file to use", + type=argparse.FileType('r')) + subparsers = parser.add_subparsers() + + args = parser.parse_args() + + run(args) + +if __name__ == '__main__': + main() |