| | | |
|---|---|---|
| author | Bryan Newbold <bnewbold@archive.org> | 2022-01-27 17:55:15 -0800 |
| committer | Bryan Newbold <bnewbold@archive.org> | 2022-01-27 17:55:15 -0800 |
| commit | c8e2462471a010e4ae368941b539e9404f3768fc (patch) | |
| tree | 8b9eaa02fbe7e75a8dfb09f341f77e6d645cc3b9 /notes | |
| parent | 2a96e2baeb7d318a4aa2abbda7052757a02f5167 (diff) | |
| download | sandcrawler-c8e2462471a010e4ae368941b539e9404f3768fc.tar.gz, sandcrawler-c8e2462471a010e4ae368941b539e9404f3768fc.zip | |
ingest notes: various in-progress projects
Diffstat (limited to 'notes')
-rw-r--r-- | notes/ingest/2021-09-03_patch_crawl.md | 92
-rw-r--r-- | notes/ingest/2021-12-13_datasets.md | 398
-rw-r--r-- | notes/ingest/2022-01-06_patch_crawl.md | 156
-rw-r--r-- | notes/ingest/2022-01-13_doi_crawl.md | 157
4 files changed, 800 insertions, 3 deletions
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md index f63e524..ad69311 100644 --- a/notes/ingest/2021-09-03_patch_crawl.md +++ b/notes/ingest/2021-09-03_patch_crawl.md @@ -482,7 +482,93 @@ Note that this is just seedlists, not full ingest requests. Then run the actual patch crawl! -## Ingest Requests for Bulk Retry +## Ingest Requests for Bulk Retry (2022-01-06) -TODO: for each of the link sources mentioned at top, do a separate query by -source to re-ingest. +Crawl has just about completed, so running another round of bulk ingest +requests, slightly updated to allow `https://doi.org/10*` in terminal URL: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.updated <= '2022-01-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + ) + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json'; + => 4,488,193 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json + + cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Stats Again + +Re-run the "progress check" stuff from above diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md new file mode 100644 index 0000000..edad789 --- /dev/null +++ b/notes/ingest/2021-12-13_datasets.md @@ -0,0 +1,398 @@ + +First round of production dataset ingest. Aiming to get one or two small +repositories entirely covered, and a few thousand datasets from all supported +platforms. + +Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up +to a TByte of content locally (on spinning disk). For successful output, will +run through fatcat import; for a subset of unsuccessful, will start a small +heritrix crawl. + + +## Ingest Generation + +Summary: + + wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json + 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json + 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json + 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json + 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json + 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json + +All the below ingest requests were combined into a single large file: + + cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz + # 24.7k 0:00:00 [91.9k/s] + +### Figshare + +- sample 10k datasets (not other types) +- want only "versioned" DOIs; use regex on DOI to ensure + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \ + | rg '10\.6084/m9\.figshare\.\d+.v\d+' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json + # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000}) + +### Zenodo + +- has DOIs (of course) +- want only "versioned" DOIs? how to skip? 
+- sample 10k + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \ + | rg '10\.5281/zenodo' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json + +### Goettingen Research Online + +- <https://data.goettingen-research-online.de/> +- Dataverse instance, not harvard-hosted +- ~1,400 datasets, ~10,500 files +- has DOIs +- `doi_prefix:10.25625`, then filter to only one slash + + ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \ + | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \ + | shuf \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json + # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s] + +### Harvard Dataverse + +- main harvard dataverse instance, many "sub-dataverses" +- ~137,000 datasets, ~1,400,000 files +- 10k sample + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \ + | rg '10\.7910/dvn/[a-z0-9]{6}' \ + | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json + # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s] + +Note that this was fewer than expected, but moving on anyways. + +### archive.org + +A couple hand-filtered items. + +"CAT" dataset +- item: <https://archive.org/details/CAT_DATASET> +- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui` + +"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing" +- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62 +- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper) + + + { + "ingest_type": "dataset", + "ingest_request_source": "savepapernow", + "base_url": "https://archive.org/details/CAT_DATASET", + "release_stage": "published", + "fatcat": { + "release_ident": "36vy7s5gtba67fmyxlmijpsaui", + "work_ident": "ycqtbhnfmzamheq2amztiwbsri" + }, + "ext_ids": {}, + "link_source": "spn", + "link_source_id": "36vy7s5gtba67fmyxlmijpsaui" + } + { + "ingest_type": "dataset", + "ingest_request_source": "savepapernow", + "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62", + "release_stage": "published", + "fatcat": { + "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu", + "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu" + }, + "ext_ids": {}, + "link_source": "spn", + "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu" + } + + # paste and then Ctrl-D: + cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json + + +## Ingest Command + +On `wbgrp-svc263`. + +In the current version of tool, `skip_cleanup_local_files=True` by default, so +files will stick around. + +Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output. 
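Since cleanup is disabled, local disk usage is worth watching while the batch
runs. A minimal check, assuming the per-item staging directory is
`/tmp/sandcrawler/` (the path that shows up in the tracebacks below):

    # rough disk-usage check during the run; staging path is an assumption
    du -sh /tmp/sandcrawler/
    df -h /tmp /srv/sandcrawler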
+ + + # first a small sample + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | head -n5 \ + | pv -l \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json + + # ok, run the whole batch through + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | pv -l \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json + +Got an error: + + internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`? + +Did a hot patch to try to have the uploads happen under a session, with config from ENV, but didn't work: + + AttributeError: 'ArchiveSession' object has no attribute 'upload' + +Going to hack with config in homedir for now. + +Extract URLs for crawling: + + cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \ + | rg '"no-capture"' \ + | rg -v '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request.base_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt + + cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \ + | rg '"no-capture"' \ + | rg '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | rg '"web-' \ + | jq .manifest[].terminal_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt + +### Exceptions Encountered + + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process + internetarchive.upload + [...] + ConnectionResetError: [Errno 104] Connection reset by peer + urllib3.exceptions.ProtocolError + requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5') + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process + archive_result = strategy_helper.process(dataset_meta) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process + r.raise_for_status() + File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status + raise HTTPError(http_error_msg, response=self) + requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201 + +download sometimes just slowly time out, like after a day or more + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process + archive_result = strategy_helper.process(dataset_meta) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process + file_meta = gen_file_metadata_path(local_path, allow_empty=True) + File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path + mimetype = magic.Magic(mime=True).from_file(path) 
+ File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 111, in from_file + with _real_open(filename): + FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz' + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process + dataset_meta = platform_helper.process_request(request, resource, html_biblio) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request + obj_latest = obj["data"]["latestVersion"] + KeyError: 'latestVersion' + +Fixed the above, trying again: + + git log | head -n1 + # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c + + Wed Dec 15 21:57:42 UTC 2021 + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json + +Zenodo seems really slow, let's try filtering those out: + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | rg -v 10.5281 \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json + # 3.76k 15:12:53 [68.7m/s] + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | rg -v 10.5281 \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json + +## Fatcat Import + + wc -l ingest_dataset_combined_results*.json + 126 ingest_dataset_combined_results2.json + 153 ingest_dataset_combined_results3.json + 275 ingest_dataset_combined_results4.json + 3762 ingest_dataset_combined_results5.json + 7736 ingest_dataset_combined_results6.json + 182 ingest_dataset_combined_results.json + 5 ingest_dataset_combined_results.ramp.json + 12239 total + + cat ingest_dataset_combined_results*.json \ + | rg '^\{' \ + | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \ + | sort \ + | uniq --check-chars 26 \ + | cut -f2 \ + | rg -v '\\\\' \ + | pv -l \ + > uniq_ingest_dataset_combined_results.json + # 9.48k 0:00:06 [1.54k/s] + + cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr + 7941 no-capture + 374 platform-404 + 369 terminal-bad-status + 348 success-file + 172 success + 79 platform-scope + 77 error-platform-download + 47 empty-manifest + 27 platform-restricted + 20 too-many-files + 12 redirect-loop + 6 error-archiveorg-upload + 3 too-large-size + 3 mismatch + 1 no-platform-match + + cat uniq_ingest_dataset_combined_results.json \ + | rg '"success' \ + | jq 'select(.status == "success") | .' -c \ + > uniq_ingest_dataset_combined_results.success.json + + cat uniq_ingest_dataset_combined_results.json \ + | rg '"success' \ + | jq 'select(.status == "success-file") | .' 
-c \ + > uniq_ingest_dataset_combined_results.success-file.json + +On fatcat QA instance: + + git log | head -n1 + # commit cca680e2cc4768a4d45e199f6256a433b25b4075 + + head /tmp/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /tmp/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0}) + +Need to update fatcat file worker to support single-file filesets... was that the plan? + + head /tmp/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0}) + + +## Summary + +As follow-up, it may be worth doing another manual round of ingest requests. +After that, would be good to fill in "glue" code so that this can be done with +kafka workers, and do re-tries/dumps using sandcrawler SQL database. Then can +start scaling up more ingest, using ingest tool, "bulk mode" processing, +heritrix crawls from `no-capture` dumps, etc, similar to bulk file ingest +process. + +For scaling, let's do a "full" ingest request generation of all datasets, and +crawl the base URL with heritrix, in fast/direct mode. Expect this to be tens +of millions of mostly DOIs (doi.org URLs), should crawl quickly. + +Then, do bulk downloading with ingest worker, perhaps on misc-vm or aitio. +uploading large datasets to archive.org, but not doing SPN web requests. Feed +the resulting huge file seedlist into a heritrix crawl to download web files. + +Will need to add support for more specific platforms. + + +### Huge Bulk Ingest Prep + +On prod instance: + + ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz + # Expecting 11264787 release objects in search queries + # TIMEOUT ERROR + # 6.07M 19:13:02 [87.7 /s] (partial) + +As follow-up, should do a full batch (not partial). For now search index is too +unreliable (read timeouts). + + zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \ + | jq .base_url -r \ + | sort -u \ + | shuf \ + | awk '{print "F+ " $1}' \ + > ingest_dataset_bulk.2022-01-05.partial.schedule + +## Retries (2022-01-12) + +This is after having done a bunch of crawling. + + cat ingest_dataset_combined_results6.json \ + | rg '"no-capture"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request -c \ + | pv -l \ + > ingest_dataset_retry.json + => 6.51k 0:00:01 [3.55k/s] + + cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json + diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md new file mode 100644 index 0000000..ffd6669 --- /dev/null +++ b/notes/ingest/2022-01-06_patch_crawl.md @@ -0,0 +1,156 @@ + +Starting another paper fulltext patch crawl, targetting recent OA content which +has failed to ingest, and platforms (arxiv, etc). 
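Before building the big seedlist query below, a per-status breakdown helps
confirm which failure modes dominate. A sketch, assuming direct psql access to
sandcrawler-db (database name and connection details are assumptions):

    # hypothetical status breakdown for pdf+html ingest requests
    psql sandcrawler -c "
        SELECT ingest_file_result.status, COUNT(*) AS cnt
        FROM ingest_request
        LEFT JOIN ingest_file_result
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
           OR ingest_request.ingest_type = 'html'
        GROUP BY ingest_file_result.status
        ORDER BY cnt DESC
        LIMIT 25;
    "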
+ +Specifically: + +- "daily" changelog ingest requests from all time, which failed with various status codes +- pdf no-capture +- SPN errors +- terminal-bad-status with 5xx, 429 +- gateway-timeout +- html no-capture +- html-resource-no-capture + +Most of these are dumped in a single complex query (below), + +TODO: html-resource-no-capture (from error message? or do SPN requests separately?) + + +## Initial 'no-capture' Seedlist + +Dump terminal URLs (will do ingest requests later, using similar command): + + COPY ( + SELECT ingest_file_result.terminal_url + -- SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'oai' + OR ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND 
ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json'; + ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt'; + => COPY 6389683 + +TODO: filter out archive.org/www.archive.org + + cat patch_terminal_url.2022-01-12.txt \ + | rg -v www.archive.org \ + | rg '://' \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | rg -i '^http' \ + | sort -u -S 4G \ + | pv -l \ + > patch_terminal_url.2022-01-12.uniq.txt + => 5.73M 0:00:47 [ 120k/s] + + # note: tweaks and re-ran the above after inspecting this output + cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25 + 799045 doi.org + 317557 linkinghub.elsevier.com + 211091 arxiv.org + 204334 iopscience.iop.org + 139758 dialnet.unirioja.es + 130331 www.scielo.br + 124626 www.persee.fr + 85764 digitalrepository.unm.edu + 83913 www.mdpi.com + 79662 www.degruyter.com + 75703 www.e-periodica.ch + 72206 dx.doi.org + 69068 escholarship.org + 67848 idus.us.es + 57907 zenodo.org + 56624 ir.opt.ac.cn + 54983 projecteuclid.org + 52226 rep.bntu.by + 48376 osf.io + 48009 pubs.rsc.org + 46947 publikationen.ub.uni-frankfurt.de + 45564 www.research-collection.ethz.ch + 45153 dk.um.si + 43313 www.ssoar.info + 40543 scholarworks.umt.edu + +TODO: cleanup ingest request table in sandcrawler-db: +- remove filtered OAI-PMH prefixes +- remove any invalid `base_url` (?) diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md new file mode 100644 index 0000000..6f3b2c8 --- /dev/null +++ b/notes/ingest/2022-01-13_doi_crawl.md @@ -0,0 +1,157 @@ + +Could roll this in to current patch crawl instead of starting a new crawl from scratch. + +## KBART "almost complete" experimentation + +Random 10 releases: + + cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}' + https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone + https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed + https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works + https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern) + https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. 
force re-crawl worked for one copy + https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success + https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref + https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success + https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success + https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed + +Try some more! + + https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success + https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success? + https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry + https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site + https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI + https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success + https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success + https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken + https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub) + https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success + + +## Seeds: fixed OJS URLs + +Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like: + +- `no-pdf-link` with terminal URL like `/article/view/` +- `redirect-loop` with terminal URL like `/article/view/` + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ( + ingest_file_result.terminal_url LIKE '%/article/view/%' + OR ingest_file_result.terminal_url LIKE '%/article/download/%' + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json'; + => COPY 326577 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json + cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Done/running. + + COPY ( + SELECT ingest_file_result.terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ( + ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'link-loop' + ) + AND ( + ingest_file_result.terminal_url LIKE '%/article/view/%' + OR ingest_file_result.terminal_url LIKE '%/article/download/%' + ) + ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt'; + => COPY 342415 + + cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule + +Done/seeded. 
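If another pass over that seedlist is needed later, it can be cleaned up the
same way as the `patch_terminal_url` list above before going to heritrix. A
sketch (the `.uniq.schedule` output name is made up here, nothing was actually
generated):

    # optional dedupe/filter pass on the OJS loop seedlist
    cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt \
        | rg -i '^https?://' \
        | rg -v '://10\.' \
        | rg -v '://172\.' \
        | sort -u -S 1G \
        | awk '{print "F+ " $1}' \
        > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.uniq.schedule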
+ +## Seeds: scitemed.com + +Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article` + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_file_result.terminal_url LIKE '%/article/view/%' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json'; + # SKIPPED + +Actually there are very few of these. + +## Seeds: non-OA paper DOIs + +There are many DOIs out there which are likely to be from small publishers, on +the web, and would ingest just fine (eg, in OJS). + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count + 30,938,106 + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count + 6,664,347 + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count + 8,258,111 + +Do the 8 million first, then maybe try the 30.9 million later? Do sampling to +see how many are actually accessible? From experience with KBART generation, +many of these are likely to crawl successfully. + + ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz + # Expecting 8255693 release objects in search queries + +## Seeds: not daily, but OA DOI + +There are a bunch of things we are no longer attempting daily, but should do +heritrix crawls of periodically. + +TODO: maybe in daily crawling, should check container coverage and see if most URLs are bright, and if so do ingest? hrm +TODO: What are they? zenodo.org? + +## Seeds: HTML and XML links from HTML biblio + + kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \ + | pv -l \ + | rg '"(html|xml)_fulltext_url"' \ + | rg '"no-pdf-link"' \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.json.gz + +## Seeds: most doi.org terminal non-success + +Unless it is a 404, should retry. |
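A sketch of what that dump could look like, following the COPY pattern used
above; the status conditions and output filename are assumptions, and this
query has not actually been run:

    # hypothetical seedlist dump: requests whose crawl dead-ended at doi.org
    # with a non-success, non-404 result (filename and filters are placeholders)
    psql sandcrawler -c "
        COPY (
            SELECT row_to_json(ingest_request.*)
            FROM ingest_request
            LEFT JOIN ingest_file_result
                ON ingest_file_result.ingest_type = ingest_request.ingest_type
                AND ingest_file_result.base_url = ingest_request.base_url
            WHERE
                ingest_request.ingest_type = 'pdf'
                AND ingest_file_result.terminal_url LIKE 'https://doi.org/10.%'
                AND ingest_file_result.status != 'success'
                AND ingest_file_result.terminal_status_code != 404
        ) TO '/srv/sandcrawler/tasks/retry_doi_terminal.2022-01-13.rows.json';
    "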