start 2020-10 ingest notes

author: Bryan Newbold <bnewbold@archive.org> 2020-10-11 21:43:23 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-11 21:43:23 -0700
commit: 2469d483326e5e81e774e46fd100888710b9bbc3 (patch)
tree: 4b127d95f0a4dd7a978a32b75b64a318b64feb41 /notes/ingest
parent: 3315e52d32701492c81758e2d297dbb501e17bc9 (diff)
download: sandcrawler-2469d483326e5e81e774e46fd100888710b9bbc3.tar.gz
sandcrawler-2469d483326e5e81e774e46fd100888710b9bbc3.zip
1 files changed, 42 insertions, 0 deletions
diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md
new file mode 100644
index 0000000..d474176
--- /dev/null
+++ b/notes/ingest/2020-10_unpaywall.md
@@ -0,0 +1,42 @@
+
+New snapshot released 2020-10-09. Want to do a mostly straightforward
+load/ingest/crawl.
+
+Proposed changes this time around:
+
+- have bulk ingest store missing URLs in sandcrawler-db with a new `no-capture`
+  status, and include those URLs in the heritrix3 crawl
+- tweak heritrix3 config for additional PDF URL extraction patterns,
+  particularly to improve OJS yield
+
+
+## Transform and Load
+
+    # in sandcrawler pipenv on aitio
+    zcat /schnell/unpaywall/unpaywall_snapshot_2020-10-09T153852.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json
+    => 28.3M 3:19:03 [2.37k/s]
+
+    cat /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+    => 28.3M 1:11:29 [ 6.6k/s]
+    => Worker: Counter({'total': 28298500, 'insert-requests': 4119939, 'update-requests': 0})
+    => JSON lines pushed: Counter({'total': 28298500, 'pushed': 28298500})
+
+## Dump new URLs, Transform, Bulk Ingest
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            -- AND date(ingest_request.created) > '2020-10-09'
+            AND (ingest_file_result.status IS NULL
+                OR ingest_file_result.status = 'no-capture')
+    ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json';
+
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json
+
+
author	Bryan Newbold <bnewbold@archive.org>	2020-10-11 21:43:23 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-10-11 21:43:23 -0700
commit	2469d483326e5e81e774e46fd100888710b9bbc3 (patch)
tree	4b127d95f0a4dd7a978a32b75b64a318b64feb41 /notes/ingest
parent	3315e52d32701492c81758e2d297dbb501e17bc9 (diff)
download	sandcrawler-2469d483326e5e81e774e46fd100888710b9bbc3.tar.gz sandcrawler-2469d483326e5e81e774e46fd100888710b9bbc3.zip