aboutsummaryrefslogtreecommitdiffstats
path: root/sql/ingest_again.md
blob: 3b4b9901bcc2f55e8906d4d7528825e47e05e463 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

## re-ingest some broken

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
            AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
            AND ingest_file_result.hit = false
            AND ingest_file_result.status like 'spn2-%'
            AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
            AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
    ) TO '/grande/snapshots/reingest_spn2-error_current.rows.json';

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.hit = false
            AND ingest_file_result.status like 'cdx-error'
            AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
            AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
            AND (ingest_request.ingest_request_source = 'fatcat-changelog'
                 OR ingest_request.ingest_request_source = 'fatcat-ingest')
    ) TO '/grande/snapshots/reingest_cdx-error_current.rows.json';

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.hit = false
            AND ingest_file_result.status like 'cdx-error'
            AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
            AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
            AND (ingest_request.ingest_request_source != 'fatcat-changelog'
                 AND ingest_request.ingest_request_source != 'fatcat-ingest')
    ) TO '/grande/snapshots/reingest_cdx-error_bulk_current.rows.json';

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.hit = false
            AND ingest_file_result.status like 'wayback-error'
            AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
            AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
    ) TO '/grande/snapshots/reingest_wayback-error_current.rows.json';

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.hit = false
            AND ingest_file_result.status like 'gateway-timeout'
            AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
            AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
            AND (ingest_request.ingest_request_source = 'fatcat-changelog'
                 OR ingest_request.ingest_request_source = 'fatcat-ingest')
    ) TO '/grande/snapshots/reingest_gateway-timeout.rows.json';

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.hit = false
            AND ingest_file_result.status like 'petabox-error'
            AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
            AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
            AND (ingest_request.ingest_request_source = 'fatcat-changelog'
                 OR ingest_request.ingest_request_source = 'fatcat-ingest')
    ) TO '/grande/snapshots/reingest_petabox-error_current.rows.json';

Transform:

    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json

Push to kafka (shuffled):

    cat reingest_spn2-error_current.json reingest_cdx-error_current.json reingest_wayback-error_current.json reingest_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1

    cat reingest_gateway-timeout.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 0

    cat reingest_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

Push to kafka (not shuffled):

    cat reingest_spn2-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
    cat reingest_cdx-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
    cat reingest_cdx-error_bulk_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    cat reingest_wayback-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
    cat reingest_gateway-timeout.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
    cat reingest_petabox-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1

## just recent fatcat-ingest

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.ingest_type = 'pdf'
            AND ingest_file_result.ingest_type = 'pdf'
            AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
            -- AND ingest_file_result.updated > NOW() - '24 hour'::INTERVAL
            AND ingest_file_result.updated > NOW() - '7 day'::INTERVAL
            AND ingest_file_result.hit = false
            AND (ingest_file_result.status like 'spn2-%'
                 OR ingest_file_result.status like 'cdx-error'
                 OR ingest_file_result.status like 'gateway-timeout'
                 OR ingest_file_result.status like 'wayback-error'
            )
            AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
            AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
            AND ingest_request.ingest_request_source = 'fatcat-ingest'
    ) TO '/grande/snapshots/reingest_fatcat_current.rows.json';

    # note: shuf
    ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json

    cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1

## specific domains

protocols.io:

    SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.ingest_type = 'pdf'
        AND ingest_request.base_url LIKE '%10.17504/protocols.io%'
    GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
    ORDER BY COUNT DESC
    LIMIT 20;

biorxiv/medrxiv:

    SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.ingest_type = 'pdf'
        AND ingest_request.base_url LIKE '%10.1101/20%'
    GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
    ORDER BY COUNT DESC
    LIMIT 20;