diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-02-08 17:49:39 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-02-08 17:49:50 -0800 |
commit | 3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd (patch) | |
tree | 077afcd3c48553dbc65760db047b2e81ba080a73 /notes/ingest/2022-01-06_patch_crawl.md | |
parent | 067c97a59a4a8728add7b9e561082a5403be52e5 (diff) | |
download | sandcrawler-3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd.tar.gz sandcrawler-3a6fc1f1c26885fd7a44b13ee156fcdb61e6aadd.zip |
more patch crawling
Diffstat (limited to 'notes/ingest/2022-01-06_patch_crawl.md')
-rw-r--r-- | notes/ingest/2022-01-06_patch_crawl.md | 138 |
1 files changed, 137 insertions, 1 deletions
diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md index ffd6669..bc1d4d5 100644 --- a/notes/ingest/2022-01-06_patch_crawl.md +++ b/notes/ingest/2022-01-06_patch_crawl.md @@ -21,7 +21,7 @@ TODO: html-resource-no-capture (from error message? or do SPN requests separatel Dump terminal URLs (will do ingest requests later, using similar command): - COPY ( + COPY ( SELECT ingest_file_result.terminal_url -- SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -154,3 +154,139 @@ TODO: filter out archive.org/www.archive.org TODO: cleanup ingest request table in sandcrawler-db: - remove filtered OAI-PMH prefixes - remove any invalid `base_url` (?) + +## More Seedlist (2022-02-08) + + COPY ( + SELECT ingest_file_result.terminal_url + -- SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'oai' + OR ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' 
+ OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europepmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE 
'%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json'; + ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt'; + => COPY 444764 + + cat patch_terminal_url.2022-02-08.txt \ + | rg -v www.archive.org \ + | rg '://' \ + | rg -v '://10\.' \ + | rg -v '://172\.' 
\ + | rg -i '^http' \ + | sort -u -S 4G \ + | pv -l \ + > patch_terminal_url.2022-02-08.uniq.txt + => 426k 0:00:04 [ 103k/s] + + cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25 + 60123 www.degruyter.com + 59314 arxiv.org + 43674 zenodo.org + 17771 doi.org + 9501 linkinghub.elsevier.com + 9379 www.mdpi.com + 5691 opendata.uni-halle.de + 5578 scholarlypublishingcollective.org + 5451 era.library.ualberta.ca + 4982 www.cairn.info + 4306 www.taylorfrancis.com + 4189 papers.ssrn.com + 4157 apps.crossref.org + 4089 www.sciencedirect.com + 4033 mdpi-res.com + 3763 dlc.mpg.de + 3408 osf.io + 2603 www.frontiersin.org + 2594 watermark.silverchair.com + 2569 journals.lww.com + 1787 underline.io + 1680 archiviostorico.fondazione1563.it + 1658 www.jstage.jst.go.jp + 1611 cyberleninka.ru + 1535 www.schoeningh.de + + cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule + => Done + +Copied to crawler svc206 and added to frontier. |