blob: 415f23b3e2c51e4597daf8a5a6e7540170d0d450 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
|
Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.
# Date stamp used in every task file name created for this patch crawl.
export PATCHDATE='2022-07-29'
# Host that the schedule file is copied to further down.
export CRAWLVM='wbgrp-svc279.us.archive.org'
# Name of the crawl job directory that receives the schedule file.
export CRAWLNAME='TARGETED-ARTICLE-CRAWL-2022-07'
## Seedlist Query
Terminal URLs dump:
-- Dump every pdf/html ingest request whose latest result looks retryable,
-- together with that result's terminal URL, as one JSON object per row.
COPY (
    SELECT row_to_json(retry_row) FROM (
        SELECT
            res.terminal_url,
            req.*
        FROM ingest_request AS req
        -- The status predicates below reject requests that have no result
        -- row, so an inner join returns exactly the rows an outer join would.
        INNER JOIN ingest_file_result AS res
            ON res.ingest_type = req.ingest_type
            AND res.base_url = req.base_url
        WHERE
            req.ingest_type IN ('pdf', 'html')
            -- AND res.updated >= '2022-01-12'
            -- Only result statuses selected for another crawl attempt.
            AND (
                res.status IN (
                    'no-capture',
                    'cdx-error',
                    'wayback-error',
                    'wayback-content-error',
                    'petabox-error',
                    'gateway-timeout'
                )
                OR res.status LIKE 'spn2-%'
                OR (
                    res.status = 'terminal-bad-status'
                    AND res.terminal_status_code IN (500, 502, 503, 429)
                )
            )
            -- Link sources included in this patch crawl
            -- ('unpaywall' and 'oai' are deliberately left disabled).
            AND req.link_source IN (
                'doi',
                'doaj',
                'dblp',
                'arxiv',
                'pmc'
                -- 'unpaywall',
                -- 'oai'
            )
            -- Skip terminal URLs on any of these hosts. A NULL terminal_url
            -- makes the predicate NULL, so such rows are skipped as well.
            -- NOTE(review): 'europmc.org' does not match 'europepmc.org';
            -- possibly a typo -- confirm before reusing this query.
            AND res.terminal_url NOT LIKE ALL (ARRAY[
                '%mdz-nbn-resolving.de%',
                '%edoc.mpg.de%',
                '%orcid.org%',
                '%gateway.isiknowledge.com%',
                '%europmc.org%',
                '%arxiv.org%',
                -- 'https://doi.org/10.%',
                '%journals.sagepub.com%',
                '%pubs.acs.org%',
                '%ahajournals.org%',
                '%www.journal.csj.jp%',
                '%aip.scitation.org%',
                '%academic.oup.com%',
                '%tandfonline.com%',
                '%researchgate.net%',
                '%muse.jhu.edu%',
                '%omicsonline.org%',
                '%link.springer.com%',
                '%ieeexplore.ieee.org%',
                -- '%zenodo.org%',
                '%t2r2.star.titech.ac.jp%',
                '%www.google.com%',
                -- '%figshare.com%',
                -- '%springernature.figshare.com%',
                '%www.archive.org%'
            ])
    ) AS retry_row
) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
=> COPY 3524573
# Build the sorted, de-duplicated list of terminal URLs from the row dump.
#   1. discard any row that contains a backslash
#   2. pull out the terminal_url field
#   3. keep only values that contain '://' and start with "http" (any case)
#   4. discard URLs whose host starts with "10." or "172."
#      (presumably private-network addresses -- confirm)
rg --invert-match '\\' < "/srv/sandcrawler/tasks/patch_ingest_request_${PATCHDATE}.rows.json" \
    | jq --raw-output '.terminal_url' \
    | rg '://' \
    | rg --ignore-case '^http' \
    | rg --invert-match '://10\.' \
    | rg --invert-match '://172\.' \
    | sort --unique --buffer-size=4G \
    | pv --line-mode \
    > "/srv/sandcrawler/tasks/patch_terminal_url.${PATCHDATE}.txt"
=> 3.11M 0:01:08 [45.4k/s]
# Sanity check: the 25 most frequent hosts in the URL list
# (the host is the third '/'-separated field of each URL).
cut -d/ -f3 "/srv/sandcrawler/tasks/patch_terminal_url.${PATCHDATE}.txt" \
    | sort \
    | uniq --count \
    | sort --numeric-sort --reverse \
    | head --lines=25
624948 doi.org
382492 www.jstage.jst.go.jp
275087 www.mdpi.com
157134 www.persee.fr
108979 www.sciencedirect.com
94375 www.scielo.br
50834 onlinelibrary.wiley.com
49991 journals.lww.com
30354 www.frontiersin.org
27963 doaj.org
27058 www.e-periodica.ch
24147 dl.acm.org
23389 aclanthology.org
22086 www.research-collection.ethz.ch
21589 medien.die-bonn.de
18866 www.ingentaconnect.com
18583 doi.nrct.go.th
18271 repositories.lib.utexas.edu
17634 hdl.handle.net
16366 archives.datapages.com
15146 cgscholar.com
13987 dl.gi.de
13188 www.degruyter.com
12503 ethos.bl.uk
12304 preprints.jmir.org
# Write the schedule file: each line of the URL list, prefixed with "F+ ".
awk '{ print "F+ " $1 }' "/srv/sandcrawler/tasks/patch_terminal_url.${PATCHDATE}.txt" \
    > "/srv/sandcrawler/tasks/patch_terminal_url.${PATCHDATE}.schedule"
=> done
# Stage the schedule file in /tmp on the crawl VM ...
scp "/srv/sandcrawler/tasks/patch_terminal_url.${PATCHDATE}.schedule" "${CRAWLVM}:/tmp"
# ... then copy it, as the heritrix user, into the crawl job's action directory.
ssh "${CRAWLVM}" sudo -u heritrix cp \
    "/tmp/patch_terminal_url.${PATCHDATE}.schedule" \
    "/0/ia-jobs/journal-crawls/${CRAWLNAME}/action/"
## Re-Ingest
Transform:
# Convert the dumped rows into ingest requests and shuffle their order.
./scripts/ingestrequest_row2json.py "/srv/sandcrawler/tasks/patch_ingest_request_${PATCHDATE}.rows.json" \
    | pv --line-mode \
    | shuf \
    > "/srv/sandcrawler/tasks/patch_ingest_request_${PATCHDATE}.requests.json"
=> 3.52M 0:01:37 [36.2k/s]
Ingest:
# Publish the requests to the bulk ingest topic: discard any request that
# contains a backslash, re-emit each one as compact JSON, then produce to Kafka.
rg --invert-match '\\' < "/srv/sandcrawler/tasks/patch_ingest_request_${PATCHDATE}.requests.json" \
    | jq -c . \
    | kafkacat -P \
        -b wbgrp-svc350.us.archive.org \
        -t sandcrawler-prod.ingest-file-requests-bulk \
        -p -1
|