aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2022-01-27 17:55:15 -0800
committerBryan Newbold <bnewbold@archive.org>2022-01-27 17:55:15 -0800
commitc8e2462471a010e4ae368941b539e9404f3768fc (patch)
tree8b9eaa02fbe7e75a8dfb09f341f77e6d645cc3b9
parent2a96e2baeb7d318a4aa2abbda7052757a02f5167 (diff)
downloadsandcrawler-c8e2462471a010e4ae368941b539e9404f3768fc.tar.gz
sandcrawler-c8e2462471a010e4ae368941b539e9404f3768fc.zip
ingest notes: various in-progress projects
-rw-r--r--notes/ingest/2021-09-03_patch_crawl.md92
-rw-r--r--notes/ingest/2021-12-13_datasets.md398
-rw-r--r--notes/ingest/2022-01-06_patch_crawl.md156
-rw-r--r--notes/ingest/2022-01-13_doi_crawl.md157
4 files changed, 800 insertions, 3 deletions
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md
index f63e524..ad69311 100644
--- a/notes/ingest/2021-09-03_patch_crawl.md
+++ b/notes/ingest/2021-09-03_patch_crawl.md
@@ -482,7 +482,93 @@ Note that this is just seedlists, not full ingest requests.
Then run the actual patch crawl!
-## Ingest Requests for Bulk Retry
+## Ingest Requests for Bulk Retry (2022-01-06)
-TODO: for each of the link sources mentioned at top, do a separate query by
-source to re-ingest.
+Crawl has just about completed, so running another round of bulk ingest
+requests, slightly updated to allow `https://doi.org/10*` in terminal URL:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.updated <= '2022-01-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR (
+ ingest_request.link_source = 'doi'
+ AND (
+ ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-changelog'
+ )
+ )
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json';
+ => 4,488,193
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Stats Again
+
+Re-run the "progress check" stuff from above
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md
new file mode 100644
index 0000000..edad789
--- /dev/null
+++ b/notes/ingest/2021-12-13_datasets.md
@@ -0,0 +1,398 @@
+
+First round of production dataset ingest. Aiming to get one or two small
+repositories entirely covered, and a few thousand datasets from all supported
+platforms.
+
+Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up
+to a TByte of content locally (on spinning disk). For successful output, will
+run through fatcat import; for a subset of unsuccessful, will start a small
+heritrix crawl.
+
+
+## Ingest Generation
+
+Summary:
+
+ wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json
+ 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+ 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+All the below ingest requests were combined into a single large file:
+
+ cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz
+ # 24.7k 0:00:00 [91.9k/s]
+
+### Figshare
+
+- sample 10k datasets (not other types)
+- want only "versioned" DOIs; use regex on DOI to ensure
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \
+ | rg '10\.6084/m9\.figshare\.\d+.v\d+' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json
+ # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000})
+
+### Zenodo
+
+- has DOIs (of course)
+- want only "versioned" DOIs? how to skip?
+- sample 10k
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \
+ | rg '10\.5281/zenodo' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json
+
+### Goettingen Research Online
+
+- <https://data.goettingen-research-online.de/>
+- Dataverse instance, not harvard-hosted
+- ~1,400 datasets, ~10,500 files
+- has DOIs
+- `doi_prefix:10.25625`, then filter to only one slash
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \
+ | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \
+ | shuf \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json
+ # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s]
+
+### Harvard Dataverse
+
+- main harvard dataverse instance, many "sub-dataverses"
+- ~137,000 datasets, ~1,400,000 files
+- 10k sample
+
+ ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \
+ | rg '10\.7910/dvn/[a-z0-9]{6}' \
+ | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \
+ | shuf -n10000 \
+ | pv -l \
+ > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json
+ # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s]
+
+Note that this was fewer than expected, but moving on anyways.
+
+### archive.org
+
+A couple hand-filtered items.
+
+"CAT" dataset
+- item: <https://archive.org/details/CAT_DATASET>
+- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui`
+
+"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing"
+- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62
+- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper)
+
+
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/CAT_DATASET",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "36vy7s5gtba67fmyxlmijpsaui",
+ "work_ident": "ycqtbhnfmzamheq2amztiwbsri"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "36vy7s5gtba67fmyxlmijpsaui"
+ }
+ {
+ "ingest_type": "dataset",
+ "ingest_request_source": "savepapernow",
+ "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62",
+ "release_stage": "published",
+ "fatcat": {
+ "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu",
+ "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu"
+ },
+ "ext_ids": {},
+ "link_source": "spn",
+ "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu"
+ }
+
+ # paste and then Ctrl-D:
+ cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json
+
+
+## Ingest Command
+
+On `wbgrp-svc263`.
+
+In the current version of tool, `skip_cleanup_local_files=True` by default, so
+files will stick around.
+
+Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output.
+
+
+ # first a small sample
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | head -n5 \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json
+
+ # ok, run the whole batch through
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | pv -l \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json
+
+Got an error:
+
+ internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`?
+
+Did a hot patch to try to have the uploads happen under a session, with config from ENV, but didn't work:
+
+ AttributeError: 'ArchiveSession' object has no attribute 'upload'
+
+Going to hack with config in homedir for now.
+
+Extract URLs for crawling:
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg -v '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | jq .request.base_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \
+ | rg '"no-capture"' \
+ | rg '"manifest"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | rg '"web-' \
+ | jq .manifest[].terminal_url -r \
+ | pv -l \
+ > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt
+
+### Exceptions Encountered
+
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process
+ internetarchive.upload
+ [...]
+ ConnectionResetError: [Errno 104] Connection reset by peer
+ urllib3.exceptions.ProtocolError
+ requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5')
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process
+ r.raise_for_status()
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status
+ raise HTTPError(http_error_msg, response=self)
+ requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201
+
+download sometimes just slowly time out, like after a day or more
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process
+ archive_result = strategy_helper.process(dataset_meta)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process
+ file_meta = gen_file_metadata_path(local_path, allow_empty=True)
+ File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path
+ mimetype = magic.Magic(mime=True).from_file(path)
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 111, in from_file
+ with _real_open(filename):
+ FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz'
+
+
+ Traceback (most recent call last):
+ File "./ingest_tool.py", line 208, in <module>
+ main()
+ File "./ingest_tool.py", line 204, in main
+ args.func(args)
+ File "./ingest_tool.py", line 57, in run_requests
+ result = fileset_worker.process(request)
+ File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process
+ dataset_meta = platform_helper.process_request(request, resource, html_biblio)
+ File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request
+ obj_latest = obj["data"]["latestVersion"]
+ KeyError: 'latestVersion'
+
+Fixed the above, trying again:
+
+ git log | head -n1
+ # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c
+
+ Wed Dec 15 21:57:42 UTC 2021
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json
+
+Zenodo seems really slow, let's try filtering those out:
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json
+ # 3.76k 15:12:53 [68.7m/s]
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | rg -v 10.5281 \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json
+
+## Fatcat Import
+
+ wc -l ingest_dataset_combined_results*.json
+ 126 ingest_dataset_combined_results2.json
+ 153 ingest_dataset_combined_results3.json
+ 275 ingest_dataset_combined_results4.json
+ 3762 ingest_dataset_combined_results5.json
+ 7736 ingest_dataset_combined_results6.json
+ 182 ingest_dataset_combined_results.json
+ 5 ingest_dataset_combined_results.ramp.json
+ 12239 total
+
+ cat ingest_dataset_combined_results*.json \
+ | rg '^\{' \
+ | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \
+ | sort \
+ | uniq --check-chars 26 \
+ | cut -f2 \
+ | rg -v '\\\\' \
+ | pv -l \
+ > uniq_ingest_dataset_combined_results.json
+ # 9.48k 0:00:06 [1.54k/s]
+
+ cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr
+ 7941 no-capture
+ 374 platform-404
+ 369 terminal-bad-status
+ 348 success-file
+ 172 success
+ 79 platform-scope
+ 77 error-platform-download
+ 47 empty-manifest
+ 27 platform-restricted
+ 20 too-many-files
+ 12 redirect-loop
+ 6 error-archiveorg-upload
+ 3 too-large-size
+ 3 mismatch
+ 1 no-platform-match
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success") | .' -c \
+ > uniq_ingest_dataset_combined_results.success.json
+
+ cat uniq_ingest_dataset_combined_results.json \
+ | rg '"success' \
+ | jq 'select(.status == "success-file") | .' -c \
+ > uniq_ingest_dataset_combined_results.success-file.json
+
+On fatcat QA instance:
+
+ git log | head -n1
+ # commit cca680e2cc4768a4d45e199f6256a433b25b4075
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /tmp/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+Need to update fatcat file worker to support single-file filesets... was that the plan?
+
+ head /tmp/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0})
+
+
+## Summary
+
+As follow-up, it may be worth doing another manual round of ingest requests.
+After that, would be good to fill in "glue" code so that this can be done with
+kafka workers, and do re-tries/dumps using sandcrawler SQL database. Then can
+start scaling up more ingest, using ingest tool, "bulk mode" processing,
+heritrix crawls from `no-capture` dumps, etc, similar to bulk file ingest
+process.
+
+For scaling, let's do a "full" ingest request generation of all datasets, and
+crawl the base URL with heritrix, in fast/direct mode. Expect this to be tens
+of millions of mostly DOIs (doi.org URLs), should crawl quickly.
+
+Then, do bulk downloading with ingest worker, perhaps on misc-vm or aitio.
+uploading large datasets to archive.org, but not doing SPN web requests. Feed
+the resulting huge file seedlist into a heritrix crawl to download web files.
+
+Will need to add support for more specific platforms.
+
+
+### Huge Bulk Ingest Prep
+
+On prod instance:
+
+ ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz
+ # Expecting 11264787 release objects in search queries
+ # TIMEOUT ERROR
+ # 6.07M 19:13:02 [87.7 /s] (partial)
+
+As follow-up, should do a full batch (not partial). For now search index is too
+unreliable (read timeouts).
+
+ zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \
+ | jq .base_url -r \
+ | sort -u \
+ | shuf \
+ | awk '{print "F+ " $1}' \
+ > ingest_dataset_bulk.2022-01-05.partial.schedule
+
+## Retries (2022-01-12)
+
+This is after having done a bunch of crawling.
+
+ cat ingest_dataset_combined_results6.json \
+ | rg '"no-capture"' \
+ | jq 'select(.status = "no-capture")' -c \
+ | jq .request -c \
+ | pv -l \
+ > ingest_dataset_retry.json
+ => 6.51k 0:00:01 [3.55k/s]
+
+ cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \
+ | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json
+
diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md
new file mode 100644
index 0000000..ffd6669
--- /dev/null
+++ b/notes/ingest/2022-01-06_patch_crawl.md
@@ -0,0 +1,156 @@
+
+Starting another paper fulltext patch crawl, targetting recent OA content which
+has failed to ingest, and platforms (arxiv, etc).
+
+Specifically:
+
+- "daily" changelog ingest requests from all time, which failed with various status codes
+- pdf no-capture
+- SPN errors
+- terminal-bad-status with 5xx, 429
+- gateway-timeout
+- html no-capture
+- html-resource-no-capture
+
+Most of these are dumped in a single complex query (below),
+
+TODO: html-resource-no-capture (from error message? or do SPN requests separately?)
+
+
+## Initial 'no-capture' Seedlist
+
+Dump terminal URLs (will do ingest requests later, using similar command):
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ -- SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'oai'
+ OR ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ OR ingest_request.link_source = 'pmc'
+ )
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json';
+ ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt';
+ => COPY 6389683
+
+TODO: filter out archive.org/www.archive.org
+
+ cat patch_terminal_url.2022-01-12.txt \
+ | rg -v www.archive.org \
+ | rg '://' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | rg -i '^http' \
+ | sort -u -S 4G \
+ | pv -l \
+ > patch_terminal_url.2022-01-12.uniq.txt
+ => 5.73M 0:00:47 [ 120k/s]
+
+ # note: tweaks and re-ran the above after inspecting this output
+ cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25
+ 799045 doi.org
+ 317557 linkinghub.elsevier.com
+ 211091 arxiv.org
+ 204334 iopscience.iop.org
+ 139758 dialnet.unirioja.es
+ 130331 www.scielo.br
+ 124626 www.persee.fr
+ 85764 digitalrepository.unm.edu
+ 83913 www.mdpi.com
+ 79662 www.degruyter.com
+ 75703 www.e-periodica.ch
+ 72206 dx.doi.org
+ 69068 escholarship.org
+ 67848 idus.us.es
+ 57907 zenodo.org
+ 56624 ir.opt.ac.cn
+ 54983 projecteuclid.org
+ 52226 rep.bntu.by
+ 48376 osf.io
+ 48009 pubs.rsc.org
+ 46947 publikationen.ub.uni-frankfurt.de
+ 45564 www.research-collection.ethz.ch
+ 45153 dk.um.si
+ 43313 www.ssoar.info
+ 40543 scholarworks.umt.edu
+
+TODO: cleanup ingest request table in sandcrawler-db:
+- remove filtered OAI-PMH prefixes
+- remove any invalid `base_url` (?)
diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md
new file mode 100644
index 0000000..6f3b2c8
--- /dev/null
+++ b/notes/ingest/2022-01-13_doi_crawl.md
@@ -0,0 +1,157 @@
+
+Could roll this in to current patch crawl instead of starting a new crawl from scratch.
+
+## KBART "almost complete" experimentation
+
+Random 10 releases:
+
+ cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}'
+ https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone
+ https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed
+ https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works
+ https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern)
+ https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. force re-crawl worked for one copy
+ https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success
+ https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref
+ https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success
+ https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success
+ https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed
+
+Try some more!
+
+ https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success
+ https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success?
+ https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry
+ https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site
+ https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI
+ https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success
+ https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success
+ https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken
+ https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub)
+ https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success
+
+
+## Seeds: fixed OJS URLs
+
+Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like:
+
+- `no-pdf-link` with terminal URL like `/article/view/`
+- `redirect-loop` with terminal URL like `/article/view/`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json';
+ => COPY 326577
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json
+ cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Done/running.
+
+ COPY (
+ SELECT ingest_file_result.terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND (
+ ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'link-loop'
+ )
+ AND (
+ ingest_file_result.terminal_url LIKE '%/article/view/%'
+ OR ingest_file_result.terminal_url LIKE '%/article/download/%'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt';
+ => COPY 342415
+
+ cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule
+
+Done/seeded.
+
+## Seeds: scitemed.com
+
+Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article`
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%/article/view/%'
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'unpaywall'
+ )
+ ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json';
+ # SKIPPED
+
+Actually there are very few of these.
+
+## Seeds: non-OA paper DOIs
+
+There are many DOIs out there which are likely to be from small publishers, on
+the web, and would ingest just fine (eg, in OJS).
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count
+ 30,938,106
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count
+ 6,664,347
+
+ fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count
+ 8,258,111
+
+Do the 8 million first, then maybe try the 30.9 million later? Do sampling to
+see how many are actually accessible? From experience with KBART generation,
+many of these are likely to crawl successfully.
+
+ ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz
+ # Expecting 8255693 release objects in search queries
+
+## Seeds: not daily, but OA DOI
+
+There are a bunch of things we are no longer attempting daily, but should do
+heritrix crawls of periodically.
+
+TODO: maybe in daily crawling, should check container coverage and see if most URLs are bright, and if so do ingest? hrm
+TODO: What are they? zenodo.org?
+
+## Seeds: HTML and XML links from HTML biblio
+
+ kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \
+ | pv -l \
+ | rg '"(html|xml)_fulltext_url"' \
+ | rg '"no-pdf-link"' \
+ | gzip \
+ > ingest_file_result_fulltext_urls.2022-01-13.json.gz
+
+## Seeds: most doi.org terminal non-success
+
+Unless it is a 404, should retry.