Diffstat (limited to 'notes/tasks')
-rw-r--r-- | notes/tasks/2020-07-22_processing_holes.md | 120
-rw-r--r-- | notes/tasks/2020-08-20_file_meta.md | 66
-rw-r--r-- | notes/tasks/2020-10-21_pdfextract_holes.md | 74
-rw-r--r-- | notes/tasks/2021-09-09_pdf_url_lists.md | 70
-rw-r--r-- | notes/tasks/2021-10-29_crossref_refs_backfill.md | 235
-rw-r--r-- | notes/tasks/2021-12-06_regrobid.md | 380
-rw-r--r-- | notes/tasks/2022-01-07_grobid_platform_pdfs.md | 23
-rw-r--r-- | notes/tasks/2022-03-07_ukraine_firedrill.md | 225
-rw-r--r-- | notes/tasks/2022-04-27_pdf_url_lists.md | 72
-rw-r--r-- | notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md | 132
10 files changed, 1397 insertions, 0 deletions
diff --git a/notes/tasks/2020-07-22_processing_holes.md b/notes/tasks/2020-07-22_processing_holes.md new file mode 100644 index 0000000..70e2b59 --- /dev/null +++ b/notes/tasks/2020-07-22_processing_holes.md @@ -0,0 +1,120 @@ + +Want to clean up missing/partial processing (GROBID, `pdf_meta`, `file_meta`) +in sandcrawler database. + + +## `pdf_meta` for petabox rows + +Ran `dump_unextracted_pdf_petabox.sql` SQL, which resulted in a .json file. + + wc -l dump_unextracted_pdf_petabox.2020-07-22.json + 1503086 dump_unextracted_pdf_petabox.2020-07-22.json + +Great, 1.5 million, not too many. Start small: + + head -n1000 dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +Full batch: + + cat dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +Re-ran on 2020-08-19: + + wc -l dump_unextracted_pdf_petabox.2020-08-19.json + 971194 dump_unextracted_pdf_petabox.2020-08-19.json + +## `pdf_meta` missing CDX rows + +First, the GROBID-ized rows but only if has a fatcat file as well. + +10,755,365! That is a lot still to process. + + cat dump_unextracted_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +Re-ran on 2020-08-19: + + wc -l dump_unextracted_pdf.fatcat.2020-08-19.json + 65517 dump_unextracted_pdf.fatcat.2020-08-19.json + +Enqueued! + +## `GROBID` missing petabox rows + + wc -l /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json + 972221 /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json + +Start small: + + head -n1000 dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Full batch: + + cat dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Re-ran on 2020-08-19: + + wc -l dump_ungrobided_pdf_petabox.2020-08-19.json + 933 dump_ungrobided_pdf_petabox.2020-08-19.json + +Enqueued! + +## `GROBID` for missing CDX rows in fatcat + + wc -l dump_ungrobided_pdf.fatcat.2020-07-22.json + 1808580 dump_ungrobided_pdf.fatcat.2020-07-22.json + +Full batch: + + cat dump_ungrobided_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +## `GROBID` for bad status + +Eg, wayback errors. + +TODO + +## `pdf_trio` for OA journal crawls + +TODO + +## `pdf_trio` for "included by heuristic", not in fatcat + +TODO + +## Live-ingest missing arxiv papers + + ./fatcat_ingest.py --allow-non-oa --limit 10000 query arxiv_id:* > /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json + => Expecting 1505184 release objects in search queries + + cat /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 22 + +Repeating this every few days should (?) result in all the backlog of arxiv +papers getting indexed. Could focus on recent years to start (with query +filter). + +## re-ingest spn2 errors (all time) + +Eg: + + spn2-cdx-lookup-failure: 143963 + spn-error: 101773 + spn2-error: 16342 + +TODO + +## re-try CDX errors + +Eg, for unpaywall only, bulk ingest all `cdx-error`. 
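As a sketch of what that dump could look like (not yet run; assumes unpaywall requests carry `link_source = 'unpaywall'`, and the output filename is illustrative), following the same `ingest_request`/`ingest_file_result` join pattern used elsewhere in these notes:

    COPY (
        SELECT row_to_json(ingest_request.*)
        FROM ingest_request
        LEFT JOIN ingest_file_result ON
            ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            ingest_request.ingest_type = 'pdf'
            AND ingest_request.link_source = 'unpaywall'
            AND ingest_file_result.status = 'cdx-error'
    )
    TO '/grande/snapshots/unpaywall_cdx_error_retry.json'
    WITH NULL '';

The resulting rows would then need to be converted back to ingest request JSON (e.g. with `scripts/ingestrequest_row2json.py`) and pushed to the bulk ingest request topic with kafkacat, same as other re-ingest batches.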
+ +TODO + +## live ingest unpaywall `no-capture` URLs + +After re-trying the CDX errors for unpaywall URLs (see above), count all the +no-capture URLs, and if reasonable recrawl them all in live more ("reasonable" +meaning fewer than 200k or so URLs). + +Could also force recrawl (not using CDX lookups) for some publisher platforms +if that made sense. + +TODO diff --git a/notes/tasks/2020-08-20_file_meta.md b/notes/tasks/2020-08-20_file_meta.md new file mode 100644 index 0000000..39c84dd --- /dev/null +++ b/notes/tasks/2020-08-20_file_meta.md @@ -0,0 +1,66 @@ + +Want to update fatcat file entities with "full" file metadata for those which are missing it. + +How many `file_meta` rows *still* don't have metadata? + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + => 62962 + +First generate list of sha1hex from most recent bulk export which are missing +at least some metadata (based on missing sha256): + + zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv + => 18.7M 0:05:46 [53.8k/s] + +Then dump the entire sandcrawler `file_meta` table as TSV, with first column +sha1hex and second column JSON with all the file metadata fields: + + COPY ( + SELECT sha1hex, row_to_json(file_meta) + FROM file_meta + WHERE sha256hex IS NOT NULL + ORDER BY sha1hex ASC + ) + TO '/grande/snapshots/file_meta_dump.tsv' + WITH NULL ''; + +Join/cut: + + export LC_ALL=C + join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json + => 18.1M 0:03:37 [83.2k/s] + +Check counts: + + cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l + => 18135313 + + zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr + 18103860 application/pdf + 29977 application/octet-stream + 876 text/html + 199 application/postscript + 171 application/gzip + 84 text/plain + 48 application/xml + 38 application/vnd.ms-powerpoint + 16 application/msword + 8 application/vnd.openxmlformats-officedocument.wordprocessingml.document + 6 image/jpeg + 4 message/rfc822 + 4 application/zip + 4 application/vnd.openxmlformats-officedocument.presentationml.presentation + 3 text/x-tex + 3 application/x-dosexec + 2 application/x-tar + 2 application/vnd.ms-tnef + 1 video/mpeg + 1 image/tiff + 1 image/svg+xml + 1 image/png + 1 image/gif + 1 audio/x-ape + 1 application/vnd.ms-office + 1 application/CDFV2-unknown + +TODO: fatcat importer diff --git a/notes/tasks/2020-10-21_pdfextract_holes.md b/notes/tasks/2020-10-21_pdfextract_holes.md new file mode 100644 index 0000000..c0bb65e --- /dev/null +++ b/notes/tasks/2020-10-21_pdfextract_holes.md @@ -0,0 +1,74 @@ + +Realized I had not enabled persisting of PDF extraction results (thumbnail, +text) in ingest worker when added over the summer. So now need to run a +catch-up. This applied to both "live" and "bulk" ingest. 
+ +## `cdx` / `ingest` / `grobid` catch-up + +First, re-run extraction for cases where we did an ingest, and grobid ran +successfully, and we have a CDX row, but no `pdf_meta`: + + -- this is a slow query + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json' + WITH NULL ''; + => 19,676,116 + +Wow, that is a lot. Many from recent OAI-PMH and OA crawls, presumably. + + cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +And again, after a couple partitions got hung up: + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json' + WITH NULL ''; + + + cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + => 562k 0:00:16 [34.6k/s] + +## `petabox` / `grobid` catch-up + +These didn't all seem to extract correctly before after 1.5m rows, there will +still 900k unprocessed. Trying again. + + COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM grobid + LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE petabox.sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json' + WITH NULL ''; + + cat /grande/snapshots/dump_unextracted_pdf_petabox.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +## `cdx` / `grobid` catch-up + +Next will be to process PDFs with GROBID and CDX but no ingest. + diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md new file mode 100644 index 0000000..cd8176e --- /dev/null +++ b/notes/tasks/2021-09-09_pdf_url_lists.md @@ -0,0 +1,70 @@ + +Want to dump a URL list to share with partners, filtered to content we think is +likely to be scholarly. + +Columns to include: + +- original URL +- capture timestamp +- SHA1 + +## Stats Overview + +file_meta table, mimetype=application/pdf: 173,816,433 + +cdx table, mimetype=application/pdf: 131,346,703 + +ingest_file_result table, pdf, success: 66,487,928 + +## Ingested PDF URLs + +"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also? 
+ + COPY ( + SELECT + base_url as start_url, + terminal_url as pdf_url, + terminal_dt as pdf_url_timestamp, + terminal_sha1hex as pdf_sha1hex + FROM ingest_file_result + WHERE + ingest_type = 'pdf' + AND status = 'success' + ) + TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv' + WITH NULL ''; + => 77,892,849 + +## CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require + + COPY ( + SELECT + cdx.url as pdf_url, + cdx.datetime as pdf_url_timestamp, + cdx.sha1hex as pdf_sha1hex + FROM cdx + LEFT JOIN file_meta + ON + cdx.sha1hex = file_meta.sha1hex + WHERE + file_meta.mimetype = 'application/pdf' + OR ( + file_meta.mimetype IS NULL + AND cdx.mimetype = 'application/pdf' + ) + ) + TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv' + WITH NULL ''; + => 147,837,935 + +## Processed web PDFs + +"Parsed web PDFs": `file_meta`, left join CDX + +(didn't do this one) + +--- + +Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09> diff --git a/notes/tasks/2021-10-29_crossref_refs_backfill.md b/notes/tasks/2021-10-29_crossref_refs_backfill.md new file mode 100644 index 0000000..94eefec --- /dev/null +++ b/notes/tasks/2021-10-29_crossref_refs_backfill.md @@ -0,0 +1,235 @@ + +The current sandcrawler-db crossref table was backfilled from a 2021-01 +snapshot, and has not been updated since. + +Would like to use the existing fatcat Kafka feed to keep the crossref table up +to date, and also backfill in GROBID reference parsing of all `unstructured` +references. + +Current plan is: + +1. use kafkacat CLI to dump crossref Kafka topic, from the begining of 2021 up + to some recent date +2. use `persist_tool.py`, with a large batch size (200?) to backfill this dump + into sandcrawler-db. this will update some rows multiple times (if there + have been updates) +3. dump the full crossref table, as a point-in-time snapshot +4. filter to crossref records that have `unstrutured` references in them (at + all) +5. use `grobid_tool.py` with `parallel` to batch process references +6. backfill these refs using a simple SQL COPY statement +7. deploy crossref persist worker, with ref updates on, and roll the consumer + group back to date of dump +8. wait for everything to catch up + + +## Commands + +Get a timestamp in milliseconds: + + 2021-01-01 is: + 1609488000 in unix time (seconds) + 1609488000000 in miliseconds + +Hrm, oldest messages seem to actually be from 2021-04-28T19:21:10Z though. Due +to topic compaction? Yup, we have a 180 day compaction policy on that topic, +probably from when kafka space was tight. Oh well! + +Updated retention for this topic to `46656000000` (~540 days, ~18 months) using +`kafka-manager` web app. + + kafkacat -C -b wbgrp-svc263.us.archive.org -t fatcat-prod.api-crossref -o s@1609488000000 \ + | pv -l \ + | gzip \ + > crossref_feed_start20210428_end20211029.json.gz + +This resulted in ~36 million rows, 46GB. + +`scp` that around, then run persist on `sandcrawler-db`: + + # in pipenv, as sandcrawler user + # manually edited to set batch size to 200 + zcat /srv/sandcrawler/tasks/crossref_feed_start20210428_end20211029.json.gz \ + | pv -l \ + | ./persist_tool.py crossref - + => 36.8M 11:02:43 [ 925 /s] + +With a single thread, the persist process runs at about 1,000 rows/sec, which +works out to about 10 hours for 36 million rows. + +At the start of this process, total PostgreSQL database size is 832.21G. At the +end, 902.51G. Have not run a `VACUUM ALL` or anything like that. 
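For reference, those before/after size numbers can be spot-checked with standard PostgreSQL catalog functions (nothing sandcrawler-specific, just a sanity-check query):

    SELECT pg_size_pretty(pg_database_size('sandcrawler'));

    SELECT relname, pg_size_pretty(pg_total_relation_size(relid)) AS total_size
    FROM pg_catalog.pg_statio_user_tables
    ORDER BY pg_total_relation_size(relid) DESC
    LIMIT 10;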
+ +Query to dump crossref rows which have any refs and compress output with pigz: + + # dump_crossref.sql + COPY ( + SELECT record + FROM crossref + WHERE record::jsonb @? '$.reference[*].unstructured' + -- LIMIT 5 + ) + TO STDOUT + WITH NULL ''; + + # 'sed' required because of double quote escaping in postgresql output:: + # https://stackoverflow.com/questions/29869983/postgres-row-to-json-produces-invalid-json-with-double-escaped-quotes/29871069 + # 'rg' filter is just being conservative + + # XXX: next time add to the pipeline: rg -v "\\\\" + # or, find some way to filter/transform this kind of SQL export better? + psql sandcrawler < dump_crossref.sql \ + | sed 's/\\"/\"/g' \ + | rg '^\{' \ + | pv -l \ + | pigz \ + > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz + => 26.1M 3:22:51 [2.15k/s] + + # NOTE: -j40 is for production run with ~dedicated GROBID server with many cores + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz \ + | rg -v "\\\\" \ + | parallel -j35 --linebuffer --round-robin --pipe ./grobid_tool.py --grobid-host http://wbgrp-svc096.us.archive.org:8070 parse-crossref-refs - \ + | pv -l \ + | pigz \ + > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz + + # from earlier testing with -j40: able to do about 300-500 records/second + # 23.9k 0:01:14 [ 320 /s] + # 134518 total refs parsed + # ~1817 refs/second parsed + + # with errors, got through about: 2.08M 1:38:20 [ 352 /s] + # was still seing bad JSON? + # JSON lines pushed: Counter({'total': 105898, 'pushed': 105886, 'error-json-decode': 12}) + + # finally, without errors: + # 18.6M 8:35:02 [ 603 /s] + +In the next step, going to need a small direct persist worker to copy lines +verbatim into just the `grobid_refs` table. + +## Errors + +Got errors when running for real: + + xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 114, column 33 + + requests.exceptions.HTTPError: 500 Server Error: Internal Server Error for url: http://wbgrp-svc096.us.archive.org:8070/api/processCitationList + + urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='wbgrp-svc096.us.archive.org', port=8070): Max retries exceeded with url: /api/processCitationList (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f54b0a3bd00>: Failed to establish a new connection: [Errno 99] Cannot assign requested address')) + + + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ERROR [2021-11-03 06:57:32,569] org.grobid.service.process.GrobidRestProcessString: An unexpected exception occurs. + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! java.lang.NullPointerException: null + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.data.BiblioItem.cleanTitles(BiblioItem.java:1784) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingLayoutTokenMultiple(CitationParser.java:175) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingStringMultiple(CitationParser.java:92) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.Engine.processRawReferences(Engine.java:168) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.process.GrobidRestProcessString.processCitationList(GrobidRestProcessString.java:316) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! 
at org.grobid.service.GrobidRestService.processCitationListReturnXml_post(GrobidRestService.java:581) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.GeneratedMethodAccessor19.invoke(Unknown Source) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at java.lang.reflect.Method.invoke(Method.java:498) + [...] + +Bogus example reference causing 500 error (among other non-error citations) (doi:10.5817/cz.muni.m210-9541-2019): + + 'Müller, R., Šidák, P. (2012). Slovník novější literární teorie. Praha: Academia.' + '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0' + 'Šotkovská, J. (2008). Rané divadelní hry Milana Uhdeho; diplomová práce. Brno: Masarykova univerzita.', + +s.strip() in python would remove these non-breaking spaces (update: implemented this later) + + Maheswari, S., Vijayalakshmi, C.: Optimization Model for Electricity Distribution System Control using Communication System by La-grangian Relaxation Technique. CiiT International Journal of Wireless Communication 3(3), 183–187 (2011) (Print: ISSN 0974 – 9756 & Online: ISSN 0974 – 9640) + +Also: + + truncating very large reference list for doi:10.1017/chol9780521264303.033 len:2281 + truncating very large reference list for doi:10.1017/chol9780521263351.011 len:3129 + truncating very large reference list for doi:10.1017/chol9780521263351.022 len:2968 + truncating very large reference list for doi:10.1017/chol9780521264303.036 len:2221 + truncating very large reference list for doi:10.1017/chol9780521264303.007 len:2238 + truncating very large reference list for doi:10.1017/chol9780521086912.001 len:2177 + truncating very large reference list for doi:10.1017/chol9780521228046.002 len:2133 + truncating very large reference list for doi:10.1017/chol9780521264303.035 len:2221 + truncating very large reference list for doi:10.1017/chol9780521264303.002 len:2279 + +Seems like bumping to 2500 as the maximum reference list size might be +reasonable (it is 2000 currently). + +After some refactoring, still getting: + + requests.exceptions.ConnectionError + +This is because I am doing POST without a session. + +Then, still got requests.exceptions.ReadTimeout + +Finally, got through the whole batch, (`18.6M 8:35:02 [ 603 /s]` output), with +only a few dozen rows like: + + GROBID returned bad XML for Crossref DOI: 10.1007/978-3-030-03008-7_21-1 + GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1496-8_3 + GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1493-7_3 + GROBID returned bad XML for Crossref DOI: 10.1007/978-3-319-96184-2_2 + GROBID returned bad XML for Crossref DOI: 10.1063/1.5031970 + truncating very large reference list for doi:10.1007/978-1-4757-1499-9_15 len:11401 + GROBID returned bad XML for Crossref DOI: 10.1016/j.oraloncology.2019.104562 + GROBID returned bad XML for Crossref DOI: 10.1016/j.pec.2020.04.010 + +So things seem to be working! + +Summary lines looked like: + + JSON lines pushed: Counter({'total': 531487, 'pushed': 531487}) + Worker: Counter({'total': 536541, 'failed': 3}) + +Failures per batch were on the order of 0 to 3. 
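The session fix mentioned above looks roughly like the following. This is a minimal sketch rather than the actual `grobid_tool.py` code, and it assumes the client POSTs raw citation strings to GROBID's `/api/processCitationList` endpoint:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # One shared session pools TCP connections, instead of opening a fresh
    # connection per POST and eventually hitting "Cannot assign requested address".
    session = requests.Session()
    # Retry failed connection attempts with exponential backoff.
    session.mount("http://", HTTPAdapter(
        max_retries=Retry(connect=3, backoff_factor=2.0),
        pool_maxsize=50,
    ))

    def process_citation_list(citations, grobid_host="http://wbgrp-svc096.us.archive.org:8070"):
        resp = session.post(
            grobid_host + "/api/processCitationList",
            data={"citations": citations},
            # generous read timeout; very large reference lists can be slow to parse
            timeout=(30.0, 180.0),
        )
        resp.raise_for_status()
        return resp.text  # TEI XML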
+ +## Postgres Backfill + +Start with a sample: + + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \ + | head -n1000 \ + | ./persist_tool.py grobid-refs - + # Worker: Counter({'total': 1000, 'insert-grobid_refs': 1000, 'update-grobid_refs': 0}) + + # same command again: + # Worker: Counter({'total': 1000, 'update-grobid_refs': 1000, 'insert-grobid_refs': 0}) + +Example DOIs: + + # no refs + 10.1007/978-1-349-04135-0_3 + http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-04135-0_3" + + # with refs + 10.1007/978-1-349-03594-6_2 + http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-03594-6_2" + +Seems to be working, so will do the full backfill. Can check table sizes on a +per-table basis when complete. + + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \ + | pv -l \ + | ./persist_tool.py grobid-refs - + # Worker: Counter({'total': 18646668, 'insert-grobid_refs': 18639195, 'update-grobid_refs': 7473}) + + +## Kafka Setup + +Added ansible config and deployed persist-crossref worker. + +First roll-back just a couple days as a test: + + ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-11-07T00:00:00.000 + + # eg: Import counts: Counter({'total': 372350, 'insert-grobid_refs': 326987, 'update-crossref': 265581, 'insert-crossref': 106769, 'update-grobid_refs': 45362, 'skip': 1}) + +Then roll-back to before the snapshot and backfill, to catch up: + + ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-10-26T00:00:00.000 + +Ran this last command on 2021-11-10, and total lag was around 2,566,741. diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md new file mode 100644 index 0000000..5fb69d1 --- /dev/null +++ b/notes/tasks/2021-12-06_regrobid.md @@ -0,0 +1,380 @@ + +Want to test recent updates of GROBID (to fix regex issue), and also re-process +a number of PDFs which failed to process with GROBID initially. + + +## HTTP 503 + +These are attempts which failed because GROBID was too busy or not running. + + # IMPROVED BELOW + COPY ( + SELECT row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.status_code = 503 + AND cdx.sha1hex IS NOT NULL + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' + WITH NULL ''; + # COPY 4749 + +Not actually that many, which seems good. Confirm that these are uniq by sha1hex: + + cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l + # 302 + +Nope! Need to add "distinct on": + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.status_code = 503 + AND cdx.sha1hex IS NOT NULL + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' + WITH NULL ''; + # COPY 4297 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +## Never Processed CDX + +PDFs in fatcat which have never been processed with GROBID. 
+ + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM fatcat_file + LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex + LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL) + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json' + WITH NULL ''; + # COPY 15488 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + +PDFs in fatcat which have never been processed with pdfextract. + + # TODO + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM fatcat_file + LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex + LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex + LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex + WHERE + pdf_meta.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.mimetype = 'application/pdf' + AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL) + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json' + WITH NULL ''; + # COPY 45535 + + cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + # 45.5k 0:00:01 [30.2k/s] + +## Timeout or Failure + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex + WHERE + (grobid.status_code = 500 OR grobid.status_code = -4) + AND cdx.sha1hex IS NOT NULL + AND file_meta.mimetype = 'application/pdf' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json' + WITH NULL ''; + # COPY 8,084,296 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +This seems to not be working very well, mostly errors, empty docs, etc. Will +roll-forward the kafka consumer group after attempting a couple hundred +thousand of these. + +Let's try limiting to files actually in fatcat: + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + (grobid.status_code = 500 OR grobid.status_code = -4) + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + AND file_meta.mimetype = 'application/pdf' + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json' + WITH NULL ''; + # COPY 529265 + +That is a much more managable batch to retry. + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 529k 0:00:17 [31.0k/s] + + +## Missing Fatcat Files + +There were around a half million fatcat file entities which didn't have `cdx` +rows in sandcrawler. Did some specific pdfextract processing; now we should do +GROBID ingest as well. + +Enque the `CDX` objects for GROBID and pdfextract processing: + + zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 354k 0:00:11 [30.6k/s] + + zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +And some earlier files of interest on `aitio`: + + cat files_missing_sha256.ingest_results.json \ + | rg '"application/pdf"' \ + | rg -v "\\\\" \ + | jq .cdx -c \ + | sort -u -S 4G \ + | pv -l \ + > files_missing_sha256.cdx.uniq.json + # 100k 0:00:47 [2.09k/s] + + cat files_missing_sha256.cdx.uniq.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + cat files_missing_sha256.cdx.uniq.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + + +## Ancient Fatcat Files + +Files from an era where we didn't record GROBID version or status, even for +success. + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status_code = 200 + AND grobid.status IS NULL + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json' + WITH NULL ''; + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 107k 0:00:03 [29.9k/s] + + +## Start Re-Processing Old GROBID Versions + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status = 'success' + AND grobid.grobid_version NOT LIKE '0.7.%' + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json' + WITH NULL ''; + +This one is huge, and want to process in batches/chunks of ~8 million at a time. + + cd /srv/sandcrawler/tasks/ + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \ + | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json + +Submit individual batches like: + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Overall progress: + + x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small) + +This finally finished on 2022-04-26. Horray! + +## General Counts + +How many fatcat files of what mimetype (reported in sandcrawler-db)? + + SELECT file_meta.mimetype, COUNT(*) + FROM fatcat_file + LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY file_meta.mimetype + ORDER BY COUNT(*) DESC + LIMIT 25; + + mimetype | count + ---------------------------------------------------------------------------+---------- + application/pdf | 45227033 + | 433068 + application/octet-stream | 30634 + application/jats+xml | 6874 + text/html | 876 + application/postscript | 199 + application/gzip | 173 + text/plain | 84 + application/xml | 48 + application/vnd.ms-powerpoint | 38 + application/msword | 16 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 8 + image/jpeg | 6 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 4 + message/rfc822 | 4 + application/zip | 4 + text/x-tex | 3 + application/x-dosexec | 3 + application/x-tar | 2 + application/vnd.ms-tnef | 2 + image/svg+xml | 1 + image/tiff | 1 + image/png | 1 + image/gif | 1 + application/vnd.ms-office | 1 + (25 rows) + + +PDF extract status? + + SELECT pdf_meta.status, COUNT(*) + FROM fatcat_file + LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY pdf_meta.status + ORDER BY COUNT(*) DESC + LIMIT 25; + + status | count + ----------------+---------- + success | 43415920 + | 2018522 + text-too-large | 122730 + parse-error | 94876 + not-pdf | 32156 + error-wayback | 14504 + bad-unicode | 279 + bad-pdf | 98 + empty-blob | 2 + (9 rows) + + +What are the GROBID status codes for fatcat files? 
Narrowed down: + + SELECT grobid.status, grobid.status_code, COUNT(*) + FROM fatcat_file + LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY grobid.status, grobid.status_code + ORDER BY COUNT(*) DESC + LIMIT 25; + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 44409069 + error | 500 | 580402 + | | 468836 + | 200 | 240660 + error-timeout | -4 | 79 + bad-grobid-xml | 200 | 38 + error | 200 | 3 + (7 rows) + +Ran the same query again on 2021-12-15: + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 45092915 + error | 500 | 302373 + | | 250335 + | 200 | 53352 + bad-grobid-xml | 200 | 39 + error-timeout | -4 | 37 + error | 200 | 34 + error | 503 | 2 + (8 rows) diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md new file mode 100644 index 0000000..b5422c2 --- /dev/null +++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md @@ -0,0 +1,23 @@ + +Martin crawled more than 10 million new PDFs from various platform domains. We +should get these processed and included in sandcrawler-db. + +## Select CDX Rows + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM cdx + LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json' + WITH NULL ''; + => COPY 8801527 + + cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + # for pdfextract, would be: sandcrawler-prod.unextracted diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md new file mode 100644 index 0000000..c727a57 --- /dev/null +++ b/notes/tasks/2022-03-07_ukraine_firedrill.md @@ -0,0 +1,225 @@ + +Want to do priority crawling of Ukranian web content, plus Russia and Belarus. + + +## What is Missing? + + (country_code:ua OR lang:uk) + => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA + later in day, already some 22k missing found! 
wow + => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing + +## Metadata Prep + +- container metadata update (no code changes) + x wikidata SPARQL update + x chocula run + x journal metadata update (fatcat) + x update journal stats (fatcat extra) +- DOAJ article metadata import + x prep and upload single JSON file + + +## Journal Homepage URL Crawl + +x dump ukraine-related journal homepages from chocula DB +x create crawl config +x start crawl +x repeat for belarus and russia + + + python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv + wc -l homepage_urls.2022-03-08.ua_tld.tsv + 1550 homepage_urls.2022-03-08.ua_tld.tsv + + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv + +sqlite3: + + select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publi + 1952 + + SELECT COUNT(*) FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ua' + OR journal.lang = 'uk' + OR journal.name like '%ukrain%' + OR journal.publisher like '%ukrain%'; + => 1970 + + .mode csv + .once homepage_urls_ukraine.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ua' + OR journal.lang = 'uk' + OR journal.name like '%ukrain%' + OR journal.publisher like '%ukrain%'; + + .mode csv + .once homepage_urls_russia.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ru' + OR journal.lang = 'ru' + OR journal.name like '%russ%' + OR journal.publisher like '%russ%'; + + .mode csv + .once homepage_urls_belarus.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'by' + OR journal.lang = 'be' + OR journal.name like '%belarus%' + OR journal.publisher like '%belarus%'; + + cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv + + wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv + 1550 homepage_urls.2022-03-08.ua_tld.tsv + 1971 homepage_urls_ukraine.tsv + 3482 homepage_urls_ukraine_combined.2022-03-08.tsv + + cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv + + wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv + 3728 homepage_urls_russia.tsv + 2420 homepage_urls.2022-03-08.ru_tld.tsv + 6030 homepage_urls_russia_combined.2022-03-08.tsv + + + cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv + + wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv + 138 homepage_urls_belarus.tsv + 85 homepage_urls.2022-03-08.by_tld.tsv + 222 homepage_urls_belarus_combined.2022-03-08.tsv + + +## Landing Page Crawl + +x create crawl config +x fatcat ingest query for related URLs + => special request code/label? +x finish .by and .ru article URL dump, start crawling +x URL list filtered from new OAI-PMH feed + => do we need to do full bulk load/dump, or not? 
+- URL list from partner (google) +- do we need to do alternative thing of iterating over containers, ingesting each? + + ./fatcat_ingest.py --env prod \ + --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ua OR lang:uk" + + # around Tue 08 Mar 2022 01:07:37 PM PST + # Expecting 185659 release objects in search queries + # didn't complete successfully? hrm + + # ok, retry "manually" (with kafkacat) + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ua OR lang:uk" \ + | pv -l \ + | gzip \ + > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json + # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318}) + # 103k 0:25:04 [68.7 /s] + + zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz + # 103k 0:00:02 [38.1k/s] + + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:by OR lang:be" \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz + # Expecting 2266 release objects in search queries + # 1.29k 0:00:34 [37.5 /s] + + zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz + + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ru OR lang:ru" \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz + # Expecting 1515246 release objects in search queries + + zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz + + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt + # 309k 0:00:03 [81.0k/s] + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt + # 71.2k 0:00:03 [19.0k/s] + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt + # 276k 0:00:03 [72.9k/s] + + +### Landing Page Bulk Ingest + +Running these 2022-03-24, after targeted crawl completed: + + zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 103k 0:00:02 [36.1k/s] + + zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 1.29k 0:00:00 [15.8k/s] + + zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 546k 0:00:13 [40.6k/s] + +It will probably take a week or more for these to complete. + + +## Outreach + +- openalex +- sucho.org +- ceeol.com diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md new file mode 100644 index 0000000..273ff32 --- /dev/null +++ b/notes/tasks/2022-04-27_pdf_url_lists.md @@ -0,0 +1,72 @@ + +Another dump of PDF URLs for partners. This time want to provide TSV with full +wayback download URLs, as well as "access" URLs. + + export TASKDATE=2022-04-27 + +## "Ingested", AKA, "Targetted" PDF URLs + +These are URLs where we did a successful ingest run. + + COPY ( + SELECT + terminal_sha1hex as pdf_sha1hex, + ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url, + ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url + FROM ingest_file_result + WHERE + ingest_type = 'pdf' + AND status = 'success' + AND hit = true + ORDER BY terminal_sha1hex ASC + -- LIMIT 10; + ) + TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv' + WITH NULL ''; + => COPY 85712674 + +May contain duplicates, both by sha1hex, URL, or both. + +Note that this could be filtered by timestamp, to make it monthly/annual. + + +## All CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require + + COPY ( + SELECT + cdx.sha1hex as pdf_sha1hex, + ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url, + ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url + FROM cdx + LEFT JOIN file_meta + ON + cdx.sha1hex = file_meta.sha1hex + WHERE + file_meta.mimetype = 'application/pdf' + OR ( + file_meta.mimetype IS NULL + AND cdx.mimetype = 'application/pdf' + ) + ORDER BY cdx.sha1hex ASC + -- LIMIT 10; + ) + TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv' + WITH NULL ''; + => COPY 161504070 + +Should be unique by wayback URL; may contain near-duplicates or duplicates by + +## Upload to archive.org + +TODO: next time compress these files first (gzip/pigz) + +ia upload ia_scholarly_urls_$TASKDATE \ + -m collection:ia_biblio_metadata \ + -m title:"IA Scholarly URLs ($TASKDATE)" \ + -m date:$TASKDATE \ + -m creator:"Internet Archive Web Group" \ + -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \ + /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv + diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md new file mode 100644 index 0000000..74d3857 --- /dev/null +++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md @@ -0,0 +1,132 @@ + +Had a huge number of SPN requests for the andrzejklimczuk.com domain, +presumably from the author. + +Many were duplicates (same file, multiple releases, often things like zenodo +duplication). Many were also GROBID 500s, due to truncated common crawl +captures. + +Needed to cleanup! 
Basically sorted through a few editgroups manually, then +rejected all the rest and manually re-submitted with the below queries and +commands: + + SELECT COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'; + => 589 + + SELECT ingest_file_result.status, COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + GROUP BY ingest_file_result.status; + + status | count + ----------------+------- + cdx-error | 1 + success | 587 + wrong-mimetype | 1 + (3 rows) + + + SELECT grobid.status_code, COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + GROUP BY grobid.status_code; + + status_code | count + -------------+------- + 200 | 385 + 500 | 202 + | 2 + (3 rows) + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + AND ingest_file_result.status = 'success' + AND grobid.status_code = 500 + ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json'; + => COPY 202 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + AND ingest_file_result.status = 'success' + AND grobid.status_code = 200 + ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json'; + => COPY 385 + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \ + > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ + | jq '. + {force_recrawl: true}' -c \ + > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \ + | shuf \ + | head -n60000 \ + | jq . 
-c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ + | shuf \ + | head -n100 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ + | shuf \ + | head -n10000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ + > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 |