Diffstat (limited to 'notes/tasks')
-rw-r--r-- | notes/tasks/2020-07-22_processing_holes.md | 120
-rw-r--r-- | notes/tasks/2020-08-20_file_meta.md | 66
-rw-r--r-- | notes/tasks/2020-10-21_pdfextract_holes.md | 74
-rw-r--r-- | notes/tasks/2021-09-09_pdf_url_lists.md | 70
-rw-r--r-- | notes/tasks/2021-10-29_crossref_refs_backfill.md | 235
-rw-r--r-- | notes/tasks/2021-12-06_regrobid.md | 380
-rw-r--r-- | notes/tasks/2022-01-07_grobid_platform_pdfs.md | 23
-rw-r--r-- | notes/tasks/2022-03-07_ukraine_firedrill.md | 225
-rw-r--r-- | notes/tasks/2022-04-27_pdf_url_lists.md | 72
-rw-r--r-- | notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md | 132
10 files changed, 1397 insertions, 0 deletions
diff --git a/notes/tasks/2020-07-22_processing_holes.md b/notes/tasks/2020-07-22_processing_holes.md new file mode 100644 index 0000000..70e2b59 --- /dev/null +++ b/notes/tasks/2020-07-22_processing_holes.md @@ -0,0 +1,120 @@ + +Want to clean up missing/partial processing (GROBID, `pdf_meta`, `file_meta`) +in sandcrawler database. + + +## `pdf_meta` for petabox rows + +Ran `dump_unextracted_pdf_petabox.sql` SQL, which resulted in a .json file. + + wc -l dump_unextracted_pdf_petabox.2020-07-22.json + 1503086 dump_unextracted_pdf_petabox.2020-07-22.json + +Great, 1.5 million, not too many. Start small: + + head -n1000 dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +Full batch: + + cat dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +Re-ran on 2020-08-19: + + wc -l dump_unextracted_pdf_petabox.2020-08-19.json + 971194 dump_unextracted_pdf_petabox.2020-08-19.json + +## `pdf_meta` missing CDX rows + +First, the GROBID-ized rows but only if has a fatcat file as well. + +10,755,365! That is a lot still to process. + + cat dump_unextracted_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +Re-ran on 2020-08-19: + + wc -l dump_unextracted_pdf.fatcat.2020-08-19.json + 65517 dump_unextracted_pdf.fatcat.2020-08-19.json + +Enqueued! + +## `GROBID` missing petabox rows + + wc -l /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json + 972221 /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json + +Start small: + + head -n1000 dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Full batch: + + cat dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Re-ran on 2020-08-19: + + wc -l dump_ungrobided_pdf_petabox.2020-08-19.json + 933 dump_ungrobided_pdf_petabox.2020-08-19.json + +Enqueued! + +## `GROBID` for missing CDX rows in fatcat + + wc -l dump_ungrobided_pdf.fatcat.2020-07-22.json + 1808580 dump_ungrobided_pdf.fatcat.2020-07-22.json + +Full batch: + + cat dump_ungrobided_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +## `GROBID` for bad status + +Eg, wayback errors. + +TODO + +## `pdf_trio` for OA journal crawls + +TODO + +## `pdf_trio` for "included by heuristic", not in fatcat + +TODO + +## Live-ingest missing arxiv papers + + ./fatcat_ingest.py --allow-non-oa --limit 10000 query arxiv_id:* > /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json + => Expecting 1505184 release objects in search queries + + cat /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 22 + +Repeating this every few days should (?) result in all the backlog of arxiv +papers getting indexed. Could focus on recent years to start (with query +filter). + +## re-ingest spn2 errors (all time) + +Eg: + + spn2-cdx-lookup-failure: 143963 + spn-error: 101773 + spn2-error: 16342 + +TODO + +## re-try CDX errors + +Eg, for unpaywall only, bulk ingest all `cdx-error`. 
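As a sketch of what that dump could look like (not yet run; assumes unpaywall requests carry `link_source = 'unpaywall'`, and the output filename is illustrative), following the same `ingest_request`/`ingest_file_result` join pattern used elsewhere in these notes:

    COPY (
        SELECT row_to_json(ingest_request.*)
        FROM ingest_request
        LEFT JOIN ingest_file_result ON
            ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_request.ingest_type = 'pdf'
            AND ingest_request.link_source = 'unpaywall'
            AND ingest_file_result.status = 'cdx-error'
    )
    TO '/grande/snapshots/unpaywall_cdx_error_retry.json'
    WITH NULL '';

The resulting rows would then need to be converted back to ingest request JSON (e.g. with `scripts/ingestrequest_row2json.py`) and pushed to the bulk ingest request topic with kafkacat, same as other re-ingest batches.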
+ +TODO + +## live ingest unpaywall `no-capture` URLs + +After re-trying the CDX errors for unpaywall URLs (see above), count all the +no-capture URLs, and if reasonable recrawl them all in live more ("reasonable" +meaning fewer than 200k or so URLs). + +Could also force recrawl (not using CDX lookups) for some publisher platforms +if that made sense. + +TODO diff --git a/notes/tasks/2020-08-20_file_meta.md b/notes/tasks/2020-08-20_file_meta.md new file mode 100644 index 0000000..39c84dd --- /dev/null +++ b/notes/tasks/2020-08-20_file_meta.md @@ -0,0 +1,66 @@ + +Want to update fatcat file entities with "full" file metadata for those which are missing it. + +How many `file_meta` rows *still* don't have metadata? + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + => 62962 + +First generate list of sha1hex from most recent bulk export which are missing +at least some metadata (based on missing sha256): + + zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv + => 18.7M 0:05:46 [53.8k/s] + +Then dump the entire sandcrawler `file_meta` table as TSV, with first column +sha1hex and second column JSON with all the file metadata fields: + + COPY ( + SELECT sha1hex, row_to_json(file_meta) + FROM file_meta + WHERE sha256hex IS NOT NULL + ORDER BY sha1hex ASC + ) + TO '/grande/snapshots/file_meta_dump.tsv' + WITH NULL ''; + +Join/cut: + + export LC_ALL=C + join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json + => 18.1M 0:03:37 [83.2k/s] + +Check counts: + + cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l + => 18135313 + + zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr + 18103860 application/pdf + 29977 application/octet-stream + 876 text/html + 199 application/postscript + 171 application/gzip + 84 text/plain + 48 application/xml + 38 application/vnd.ms-powerpoint + 16 application/msword + 8 application/vnd.openxmlformats-officedocument.wordprocessingml.document + 6 image/jpeg + 4 message/rfc822 + 4 application/zip + 4 application/vnd.openxmlformats-officedocument.presentationml.presentation + 3 text/x-tex + 3 application/x-dosexec + 2 application/x-tar + 2 application/vnd.ms-tnef + 1 video/mpeg + 1 image/tiff + 1 image/svg+xml + 1 image/png + 1 image/gif + 1 audio/x-ape + 1 application/vnd.ms-office + 1 application/CDFV2-unknown + +TODO: fatcat importer diff --git a/notes/tasks/2020-10-21_pdfextract_holes.md b/notes/tasks/2020-10-21_pdfextract_holes.md new file mode 100644 index 0000000..c0bb65e --- /dev/null +++ b/notes/tasks/2020-10-21_pdfextract_holes.md @@ -0,0 +1,74 @@ + +Realized I had not enabled persisting of PDF extraction results (thumbnail, +text) in ingest worker when added over the summer. So now need to run a +catch-up. This applied to both "live" and "bulk" ingest. 
+ +## `cdx` / `ingest` / `grobid` catch-up + +First, re-run extraction for cases where we did an ingest, and grobid ran +successfully, and we have a CDX row, but no `pdf_meta`: + + -- this is a slow query + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json' + WITH NULL ''; + => 19,676,116 + +Wow, that is a lot. Many from recent OAI-PMH and OA crawls, presumably. + + cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +And again, after a couple partitions got hung up: + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json' + WITH NULL ''; + + + cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + => 562k 0:00:16 [34.6k/s] + +## `petabox` / `grobid` catch-up + +These didn't all seem to extract correctly before after 1.5m rows, there will +still 900k unprocessed. Trying again. + + COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM grobid + LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE petabox.sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json' + WITH NULL ''; + + cat /grande/snapshots/dump_unextracted_pdf_petabox.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +## `cdx` / `grobid` catch-up + +Next will be to process PDFs with GROBID and CDX but no ingest. + diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md new file mode 100644 index 0000000..cd8176e --- /dev/null +++ b/notes/tasks/2021-09-09_pdf_url_lists.md @@ -0,0 +1,70 @@ + +Want to dump a URL list to share with partners, filtered to content we think is +likely to be scholarly. + +Columns to include: + +- original URL +- capture timestamp +- SHA1 + +## Stats Overview + +file_meta table, mimetype=application/pdf: 173,816,433 + +cdx table, mimetype=application/pdf: 131,346,703 + +ingest_file_result table, pdf, success: 66,487,928 + +## Ingested PDF URLs + +"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also? 
+ + COPY ( + SELECT + base_url as start_url, + terminal_url as pdf_url, + terminal_dt as pdf_url_timestamp, + terminal_sha1hex as pdf_sha1hex + FROM ingest_file_result + WHERE + ingest_type = 'pdf' + AND status = 'success' + ) + TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv' + WITH NULL ''; + => 77,892,849 + +## CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require + + COPY ( + SELECT + cdx.url as pdf_url, + cdx.datetime as pdf_url_timestamp, + cdx.sha1hex as pdf_sha1hex + FROM cdx + LEFT JOIN file_meta + ON + cdx.sha1hex = file_meta.sha1hex + WHERE + file_meta.mimetype = 'application/pdf' + OR ( + file_meta.mimetype IS NULL + AND cdx.mimetype = 'application/pdf' + ) + ) + TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv' + WITH NULL ''; + => 147,837,935 + +## Processed web PDFs + +"Parsed web PDFs": `file_meta`, left join CDX + +(didn't do this one) + +--- + +Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09> diff --git a/notes/tasks/2021-10-29_crossref_refs_backfill.md b/notes/tasks/2021-10-29_crossref_refs_backfill.md new file mode 100644 index 0000000..94eefec --- /dev/null +++ b/notes/tasks/2021-10-29_crossref_refs_backfill.md @@ -0,0 +1,235 @@ + +The current sandcrawler-db crossref table was backfilled from a 2021-01 +snapshot, and has not been updated since. + +Would like to use the existing fatcat Kafka feed to keep the crossref table up +to date, and also backfill in GROBID reference parsing of all `unstructured` +references. + +Current plan is: + +1. use kafkacat CLI to dump crossref Kafka topic, from the begining of 2021 up + to some recent date +2. use `persist_tool.py`, with a large batch size (200?) to backfill this dump + into sandcrawler-db. this will update some rows multiple times (if there + have been updates) +3. dump the full crossref table, as a point-in-time snapshot +4. filter to crossref records that have `unstrutured` references in them (at + all) +5. use `grobid_tool.py` with `parallel` to batch process references +6. backfill these refs using a simple SQL COPY statement +7. deploy crossref persist worker, with ref updates on, and roll the consumer + group back to date of dump +8. wait for everything to catch up + + +## Commands + +Get a timestamp in milliseconds: + + 2021-01-01 is: + 1609488000 in unix time (seconds) + 1609488000000 in miliseconds + +Hrm, oldest messages seem to actually be from 2021-04-28T19:21:10Z though. Due +to topic compaction? Yup, we have a 180 day compaction policy on that topic, +probably from when kafka space was tight. Oh well! + +Updated retention for this topic to `46656000000` (~540 days, ~18 months) using +`kafka-manager` web app. + + kafkacat -C -b wbgrp-svc263.us.archive.org -t fatcat-prod.api-crossref -o s@1609488000000 \ + | pv -l \ + | gzip \ + > crossref_feed_start20210428_end20211029.json.gz + +This resulted in ~36 million rows, 46GB. + +`scp` that around, then run persist on `sandcrawler-db`: + + # in pipenv, as sandcrawler user + # manually edited to set batch size to 200 + zcat /srv/sandcrawler/tasks/crossref_feed_start20210428_end20211029.json.gz \ + | pv -l \ + | ./persist_tool.py crossref - + => 36.8M 11:02:43 [ 925 /s] + +With a single thread, the persist process runs at about 1,000 rows/sec, which +works out to about 10 hours for 36 million rows. + +At the start of this process, total PostgreSQL database size is 832.21G. At the +end, 902.51G. Have not run a `VACUUM ALL` or anything like that. 
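For reference, those before/after size numbers can be spot-checked with standard PostgreSQL catalog functions (nothing sandcrawler-specific, just a sanity-check query):

    SELECT pg_size_pretty(pg_database_size('sandcrawler'));

    SELECT relname, pg_size_pretty(pg_total_relation_size(relid)) AS total_size
    FROM pg_catalog.pg_statio_user_tables
    ORDER BY pg_total_relation_size(relid) DESC
    LIMIT 10;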
+ +Query to dump crossref rows which have any refs and compress output with pigz: + + # dump_crossref.sql + COPY ( + SELECT record + FROM crossref + WHERE record::jsonb @? '$.reference[*].unstructured' + -- LIMIT 5 + ) + TO STDOUT + WITH NULL ''; + + # 'sed' required because of double quote escaping in postgresql output:: + # https://stackoverflow.com/questions/29869983/postgres-row-to-json-produces-invalid-json-with-double-escaped-quotes/29871069 + # 'rg' filter is just being conservative + + # XXX: next time add to the pipeline: rg -v "\\\\" + # or, find some way to filter/transform this kind of SQL export better? + psql sandcrawler < dump_crossref.sql \ + | sed 's/\\"/\"/g' \ + | rg '^\{' \ + | pv -l \ + | pigz \ + > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz + => 26.1M 3:22:51 [2.15k/s] + + # NOTE: -j40 is for production run with ~dedicated GROBID server with many cores + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz \ + | rg -v "\\\\" \ + | parallel -j35 --linebuffer --round-robin --pipe ./grobid_tool.py --grobid-host http://wbgrp-svc096.us.archive.org:8070 parse-crossref-refs - \ + | pv -l \ + | pigz \ + > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz + + # from earlier testing with -j40: able to do about 300-500 records/second + # 23.9k 0:01:14 [ 320 /s] + # 134518 total refs parsed + # ~1817 refs/second parsed + + # with errors, got through about: 2.08M 1:38:20 [ 352 /s] + # was still seing bad JSON? + # JSON lines pushed: Counter({'total': 105898, 'pushed': 105886, 'error-json-decode': 12}) + + # finally, without errors: + # 18.6M 8:35:02 [ 603 /s] + +In the next step, going to need a small direct persist worker to copy lines +verbatim into just the `grobid_refs` table. + +## Errors + +Got errors when running for real: + + xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 114, column 33 + + requests.exceptions.HTTPError: 500 Server Error: Internal Server Error for url: http://wbgrp-svc096.us.archive.org:8070/api/processCitationList + + urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='wbgrp-svc096.us.archive.org', port=8070): Max retries exceeded with url: /api/processCitationList (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f54b0a3bd00>: Failed to establish a new connection: [Errno 99] Cannot assign requested address')) + + + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ERROR [2021-11-03 06:57:32,569] org.grobid.service.process.GrobidRestProcessString: An unexpected exception occurs. + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! java.lang.NullPointerException: null + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.data.BiblioItem.cleanTitles(BiblioItem.java:1784) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingLayoutTokenMultiple(CitationParser.java:175) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingStringMultiple(CitationParser.java:92) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.Engine.processRawReferences(Engine.java:168) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.process.GrobidRestProcessString.processCitationList(GrobidRestProcessString.java:316) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! 
at org.grobid.service.GrobidRestService.processCitationListReturnXml_post(GrobidRestService.java:581) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.GeneratedMethodAccessor19.invoke(Unknown Source) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at java.lang.reflect.Method.invoke(Method.java:498) + [...] + +Bogus example reference causing 500 error (among other non-error citations) (doi:10.5817/cz.muni.m210-9541-2019): + + 'Müller, R., Šidák, P. (2012). Slovník novější literární teorie. Praha: Academia.' + '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0' + 'Šotkovská, J. (2008). Rané divadelní hry Milana Uhdeho; diplomová práce. Brno: Masarykova univerzita.', + +s.strip() in python would remove these non-breaking spaces (update: implemented this later) + + Maheswari, S., Vijayalakshmi, C.: Optimization Model for Electricity Distribution System Control using Communication System by La-grangian Relaxation Technique. CiiT International Journal of Wireless Communication 3(3), 183–187 (2011) (Print: ISSN 0974 – 9756 & Online: ISSN 0974 – 9640) + +Also: + + truncating very large reference list for doi:10.1017/chol9780521264303.033 len:2281 + truncating very large reference list for doi:10.1017/chol9780521263351.011 len:3129 + truncating very large reference list for doi:10.1017/chol9780521263351.022 len:2968 + truncating very large reference list for doi:10.1017/chol9780521264303.036 len:2221 + truncating very large reference list for doi:10.1017/chol9780521264303.007 len:2238 + truncating very large reference list for doi:10.1017/chol9780521086912.001 len:2177 + truncating very large reference list for doi:10.1017/chol9780521228046.002 len:2133 + truncating very large reference list for doi:10.1017/chol9780521264303.035 len:2221 + truncating very large reference list for doi:10.1017/chol9780521264303.002 len:2279 + +Seems like bumping to 2500 as the maximum reference list size might be +reasonable (it is 2000 currently). + +After some refactoring, still getting: + + requests.exceptions.ConnectionError + +This is because I am doing POST without a session. + +Then, still got requests.exceptions.ReadTimeout + +Finally, got through the whole batch, (`18.6M 8:35:02 [ 603 /s]` output), with +only a few dozen rows like: + + GROBID returned bad XML for Crossref DOI: 10.1007/978-3-030-03008-7_21-1 + GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1496-8_3 + GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1493-7_3 + GROBID returned bad XML for Crossref DOI: 10.1007/978-3-319-96184-2_2 + GROBID returned bad XML for Crossref DOI: 10.1063/1.5031970 + truncating very large reference list for doi:10.1007/978-1-4757-1499-9_15 len:11401 + GROBID returned bad XML for Crossref DOI: 10.1016/j.oraloncology.2019.104562 + GROBID returned bad XML for Crossref DOI: 10.1016/j.pec.2020.04.010 + +So things seem to be working! + +Summary lines looked like: + + JSON lines pushed: Counter({'total': 531487, 'pushed': 531487}) + Worker: Counter({'total': 536541, 'failed': 3}) + +Failures per batch were on the order of 0 to 3. 
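The session fix mentioned above looks roughly like the following. This is a minimal sketch rather than the actual `grobid_tool.py` code, and it assumes the client POSTs raw citation strings to GROBID's `/api/processCitationList` endpoint:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # One shared session pools TCP connections, instead of opening a fresh
    # connection per POST and eventually hitting "Cannot assign requested address".
    session = requests.Session()
    # Retry failed connection attempts with exponential backoff.
    session.mount("http://", HTTPAdapter(
        max_retries=Retry(connect=3, backoff_factor=2.0),
        pool_maxsize=50,
    ))

    def process_citation_list(citations, grobid_host="http://wbgrp-svc096.us.archive.org:8070"):
        resp = session.post(
            grobid_host + "/api/processCitationList",
            data={"citations": citations},
            # generous read timeout; very large reference lists can be slow to parse
            timeout=(30.0, 180.0),
        )
        resp.raise_for_status()
        return resp.text  # TEI XML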
+ +## Postgres Backfill + +Start with a sample: + + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \ + | head -n1000 \ + | ./persist_tool.py grobid-refs - + # Worker: Counter({'total': 1000, 'insert-grobid_refs': 1000, 'update-grobid_refs': 0}) + + # same command again: + # Worker: Counter({'total': 1000, 'update-grobid_refs': 1000, 'insert-grobid_refs': 0}) + +Example DOIs: + + # no refs + 10.1007/978-1-349-04135-0_3 + http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-04135-0_3" + + # with refs + 10.1007/978-1-349-03594-6_2 + http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-03594-6_2" + +Seems to be working, so will do the full backfill. Can check table sizes on a +per-table basis when complete. + + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \ + | pv -l \ + | ./persist_tool.py grobid-refs - + # Worker: Counter({'total': 18646668, 'insert-grobid_refs': 18639195, 'update-grobid_refs': 7473}) + + +## Kafka Setup + +Added ansible config and deployed persist-crossref worker. + +First roll-back just a couple days as a test: + + ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-11-07T00:00:00.000 + + # eg: Import counts: Counter({'total': 372350, 'insert-grobid_refs': 326987, 'update-crossref': 265581, 'insert-crossref': 106769, 'update-grobid_refs': 45362, 'skip': 1}) + +Then roll-back to before the snapshot and backfill, to catch up: + + ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-10-26T00:00:00.000 + +Ran this last command on 2021-11-10, and total lag was around 2,566,741. diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md new file mode 100644 index 0000000..5fb69d1 --- /dev/null +++ b/notes/tasks/2021-12-06_regrobid.md @@ -0,0 +1,380 @@ + +Want to test recent updates of GROBID (to fix regex issue), and also re-process +a number of PDFs which failed to process with GROBID initially. + + +## HTTP 503 + +These are attempts which failed because GROBID was too busy or not running. + + # IMPROVED BELOW + COPY ( + SELECT row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.status_code = 503 + AND cdx.sha1hex IS NOT NULL + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' + WITH NULL ''; + # COPY 4749 + +Not actually that many, which seems good. Confirm that these are uniq by sha1hex: + + cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l + # 302 + +Nope! Need to add "distinct on": + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.status_code = 503 + AND cdx.sha1hex IS NOT NULL + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' + WITH NULL ''; + # COPY 4297 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +## Never Processed CDX + +PDFs in fatcat which have never been processed with GROBID. 
+ + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM fatcat_file + LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex + LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL) + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json' + WITH NULL ''; + # COPY 15488 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + +PDFs in fatcat which have never been processed with pdfextract. + + # TODO + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM fatcat_file + LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex + LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex + LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex + WHERE + pdf_meta.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.mimetype = 'application/pdf' + AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL) + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json' + WITH NULL ''; + # COPY 45535 + + cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + # 45.5k 0:00:01 [30.2k/s] + +## Timeout or Failure + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex + WHERE + (grobid.status_code = 500 OR grobid.status_code = -4) + AND cdx.sha1hex IS NOT NULL + AND file_meta.mimetype = 'application/pdf' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json' + WITH NULL ''; + # COPY 8,084,296 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +This seems to not be working very well, mostly errors, empty docs, etc. Will +roll-forward the kafka consumer group after attempting a couple hundred +thousand of these. + +Let's try limiting to files actually in fatcat: + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + (grobid.status_code = 500 OR grobid.status_code = -4) + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + AND file_meta.mimetype = 'application/pdf' + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json' + WITH NULL ''; + # COPY 529265 + +That is a much more managable batch to retry. + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 529k 0:00:17 [31.0k/s] + + +## Missing Fatcat Files + +There were around a half million fatcat file entities which didn't have `cdx` +rows in sandcrawler. Did some specific pdfextract processing; now we should do +GROBID ingest as well. + +Enque the `CDX` objects for GROBID and pdfextract processing: + + zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 354k 0:00:11 [30.6k/s] + + zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +And some earlier files of interest on `aitio`: + + cat files_missing_sha256.ingest_results.json \ + | rg '"application/pdf"' \ + | rg -v "\\\\" \ + | jq .cdx -c \ + | sort -u -S 4G \ + | pv -l \ + > files_missing_sha256.cdx.uniq.json + # 100k 0:00:47 [2.09k/s] + + cat files_missing_sha256.cdx.uniq.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + cat files_missing_sha256.cdx.uniq.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + + +## Ancient Fatcat Files + +Files from an era where we didn't record GROBID version or status, even for +success. + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status_code = 200 + AND grobid.status IS NULL + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json' + WITH NULL ''; + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 107k 0:00:03 [29.9k/s] + + +## Start Re-Processing Old GROBID Versions + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status = 'success' + AND grobid.grobid_version NOT LIKE '0.7.%' + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json' + WITH NULL ''; + +This one is huge, and want to process in batches/chunks of ~8 million at a time. + + cd /srv/sandcrawler/tasks/ + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \ + | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json + +Submit individual batches like: + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Overall progress: + + x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small) + +This finally finished on 2022-04-26. Horray! + +## General Counts + +How many fatcat files of what mimetype (reported in sandcrawler-db)? + + SELECT file_meta.mimetype, COUNT(*) + FROM fatcat_file + LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY file_meta.mimetype + ORDER BY COUNT(*) DESC + LIMIT 25; + + mimetype | count + ---------------------------------------------------------------------------+---------- + application/pdf | 45227033 + | 433068 + application/octet-stream | 30634 + application/jats+xml | 6874 + text/html | 876 + application/postscript | 199 + application/gzip | 173 + text/plain | 84 + application/xml | 48 + application/vnd.ms-powerpoint | 38 + application/msword | 16 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 8 + image/jpeg | 6 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 4 + message/rfc822 | 4 + application/zip | 4 + text/x-tex | 3 + application/x-dosexec | 3 + application/x-tar | 2 + application/vnd.ms-tnef | 2 + image/svg+xml | 1 + image/tiff | 1 + image/png | 1 + image/gif | 1 + application/vnd.ms-office | 1 + (25 rows) + + +PDF extract status? + + SELECT pdf_meta.status, COUNT(*) + FROM fatcat_file + LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY pdf_meta.status + ORDER BY COUNT(*) DESC + LIMIT 25; + + status | count + ----------------+---------- + success | 43415920 + | 2018522 + text-too-large | 122730 + parse-error | 94876 + not-pdf | 32156 + error-wayback | 14504 + bad-unicode | 279 + bad-pdf | 98 + empty-blob | 2 + (9 rows) + + +What are the GROBID status codes for fatcat files? 
Narrowed down: + + SELECT grobid.status, grobid.status_code, COUNT(*) + FROM fatcat_file + LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY grobid.status, grobid.status_code + ORDER BY COUNT(*) DESC + LIMIT 25; + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 44409069 + error | 500 | 580402 + | | 468836 + | 200 | 240660 + error-timeout | -4 | 79 + bad-grobid-xml | 200 | 38 + error | 200 | 3 + (7 rows) + +Ran the same query again on 2021-12-15: + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 45092915 + error | 500 | 302373 + | | 250335 + | 200 | 53352 + bad-grobid-xml | 200 | 39 + error-timeout | -4 | 37 + error | 200 | 34 + error | 503 | 2 + (8 rows) diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md new file mode 100644 index 0000000..b5422c2 --- /dev/null +++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md @@ -0,0 +1,23 @@ + +Martin crawled more than 10 million new PDFs from various platform domains. We +should get these processed and included in sandcrawler-db. + +## Select CDX Rows + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM cdx + LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json' + WITH NULL ''; + => COPY 8801527 + + cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + # for pdfextract, would be: sandcrawler-prod.unextracted diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md new file mode 100644 index 0000000..c727a57 --- /dev/null +++ b/notes/tasks/2022-03-07_ukraine_firedrill.md @@ -0,0 +1,225 @@ + +Want to do priority crawling of Ukranian web content, plus Russia and Belarus. + + +## What is Missing? + + (country_code:ua OR lang:uk) + => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA + later in day, already some 22k missing found! 
wow + => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing + +## Metadata Prep + +- container metadata update (no code changes) + x wikidata SPARQL update + x chocula run + x journal metadata update (fatcat) + x update journal stats (fatcat extra) +- DOAJ article metadata import + x prep and upload single JSON file + + +## Journal Homepage URL Crawl + +x dump ukraine-related journal homepages from chocula DB +x create crawl config +x start crawl +x repeat for belarus and russia + + + python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv + wc -l homepage_urls.2022-03-08.ua_tld.tsv + 1550 homepage_urls.2022-03-08.ua_tld.tsv + + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv + +sqlite3: + + select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publi + 1952 + + SELECT COUNT(*) FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ua' + OR journal.lang = 'uk' + OR journal.name like '%ukrain%' + OR journal.publisher like '%ukrain%'; + => 1970 + + .mode csv + .once homepage_urls_ukraine.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ua' + OR journal.lang = 'uk' + OR journal.name like '%ukrain%' + OR journal.publisher like '%ukrain%'; + + .mode csv + .once homepage_urls_russia.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ru' + OR journal.lang = 'ru' + OR journal.name like '%russ%' + OR journal.publisher like '%russ%'; + + .mode csv + .once homepage_urls_belarus.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'by' + OR journal.lang = 'be' + OR journal.name like '%belarus%' + OR journal.publisher like '%belarus%'; + + cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv + + wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv + 1550 homepage_urls.2022-03-08.ua_tld.tsv + 1971 homepage_urls_ukraine.tsv + 3482 homepage_urls_ukraine_combined.2022-03-08.tsv + + cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv + + wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv + 3728 homepage_urls_russia.tsv + 2420 homepage_urls.2022-03-08.ru_tld.tsv + 6030 homepage_urls_russia_combined.2022-03-08.tsv + + + cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv + + wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv + 138 homepage_urls_belarus.tsv + 85 homepage_urls.2022-03-08.by_tld.tsv + 222 homepage_urls_belarus_combined.2022-03-08.tsv + + +## Landing Page Crawl + +x create crawl config +x fatcat ingest query for related URLs + => special request code/label? +x finish .by and .ru article URL dump, start crawling +x URL list filtered from new OAI-PMH feed + => do we need to do full bulk load/dump, or not? 
+- URL list from partner (google) +- do we need to do alternative thing of iterating over containers, ingesting each? + + ./fatcat_ingest.py --env prod \ + --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ua OR lang:uk" + + # around Tue 08 Mar 2022 01:07:37 PM PST + # Expecting 185659 release objects in search queries + # didn't complete successfully? hrm + + # ok, retry "manually" (with kafkacat) + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ua OR lang:uk" \ + | pv -l \ + | gzip \ + > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json + # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318}) + # 103k 0:25:04 [68.7 /s] + + zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz + # 103k 0:00:02 [38.1k/s] + + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:by OR lang:be" \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz + # Expecting 2266 release objects in search queries + # 1.29k 0:00:34 [37.5 /s] + + zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz + + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ru OR lang:ru" \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz + # Expecting 1515246 release objects in search queries + + zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz + + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt + # 309k 0:00:03 [81.0k/s] + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt + # 71.2k 0:00:03 [19.0k/s] + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt + # 276k 0:00:03 [72.9k/s] + + +### Landing Page Bulk Ingest + +Running these 2022-03-24, after targeted crawl completed: + + zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 103k 0:00:02 [36.1k/s] + + zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 1.29k 0:00:00 [15.8k/s] + + zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 546k 0:00:13 [40.6k/s] + +It will probably take a week or more for these to complete. + + +## Outreach + +- openalex +- sucho.org +- ceeol.com diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md new file mode 100644 index 0000000..273ff32 --- /dev/null +++ b/notes/tasks/2022-04-27_pdf_url_lists.md @@ -0,0 +1,72 @@ + +Another dump of PDF URLs for partners. This time want to provide TSV with full +wayback download URLs, as well as "access" URLs. + + export TASKDATE=2022-04-27 + +## "Ingested", AKA, "Targetted" PDF URLs + +These are URLs where we did a successful ingest run. + + COPY ( + SELECT + terminal_sha1hex as pdf_sha1hex, + ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url, + ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url + FROM ingest_file_result + WHERE + ingest_type = 'pdf' + AND status = 'success' + AND hit = true + ORDER BY terminal_sha1hex ASC + -- LIMIT 10; + ) + TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv' + WITH NULL ''; + => COPY 85712674 + +May contain duplicates, both by sha1hex, URL, or both. + +Note that this could be filtered by timestamp, to make it monthly/annual. + + +## All CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require + + COPY ( + SELECT + cdx.sha1hex as pdf_sha1hex, + ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url, + ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url + FROM cdx + LEFT JOIN file_meta + ON + cdx.sha1hex = file_meta.sha1hex + WHERE + file_meta.mimetype = 'application/pdf' + OR ( + file_meta.mimetype IS NULL + AND cdx.mimetype = 'application/pdf' + ) + ORDER BY cdx.sha1hex ASC + -- LIMIT 10; + ) + TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv' + WITH NULL ''; + => COPY 161504070 + +Should be unique by wayback URL; may contain near-duplicates or duplicates by + +## Upload to archive.org + +TODO: next time compress these files first (gzip/pigz) + +ia upload ia_scholarly_urls_$TASKDATE \ + -m collection:ia_biblio_metadata \ + -m title:"IA Scholarly URLs ($TASKDATE)" \ + -m date:$TASKDATE \ + -m creator:"Internet Archive Web Group" \ + -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \ + /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv + diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md new file mode 100644 index 0000000..74d3857 --- /dev/null +++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md @@ -0,0 +1,132 @@ + +Had a huge number of SPN requests for the andrzejklimczuk.com domain, +presumably from the author. + +Many were duplicates (same file, multiple releases, often things like zenodo +duplication). Many were also GROBID 500s, due to truncated common crawl +captures. + +Needed to cleanup! 
Basically sorted through a few editgroups manually, then +rejected all the rest and manually re-submitted with the below queries and +commands: + + SELECT COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'; + => 589 + + SELECT ingest_file_result.status, COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + GROUP BY ingest_file_result.status; + + status | count + ----------------+------- + cdx-error | 1 + success | 587 + wrong-mimetype | 1 + (3 rows) + + + SELECT grobid.status_code, COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + GROUP BY grobid.status_code; + + status_code | count + -------------+------- + 200 | 385 + 500 | 202 + | 2 + (3 rows) + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + AND ingest_file_result.status = 'success' + AND grobid.status_code = 500 + ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json'; + => COPY 202 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + AND ingest_file_result.status = 'success' + AND grobid.status_code = 200 + ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json'; + => COPY 385 + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \ + > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ + | jq '. + {force_recrawl: true}' -c \ + > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \ + | shuf \ + | head -n60000 \ + | jq . 
-c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ + | shuf \ + | head -n100 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ + | shuf \ + | head -n10000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ + > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 |