aboutsummaryrefslogtreecommitdiffstats
path: root/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
blob: 74d38577585575affab16e84f57e7542775c363a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

Had a huge number of SPN requests for the andrzejklimczuk.com domain,
presumably from the author.

Many were duplicates (same file, multiple releases, often things like Zenodo
duplication). Many were also GROBID 500s, due to truncated Common Crawl
captures.

Needed to clean up! Basically sorted through a few editgroups manually, then
rejected all the rest and manually re-submitted with the below queries and
commands:

    -- Total number of SPN PDF ingest requests under the andrzejklimczuk.com
    -- URL prefix. Both joins are LEFT JOINs, so a request is counted even
    -- when it has no ingest result or no GROBID row.
    SELECT COUNT(*)
    FROM ingest_request AS req
    LEFT JOIN ingest_file_result AS res
        ON res.ingest_type = req.ingest_type
        AND res.base_url = req.base_url
    LEFT JOIN grobid
        ON grobid.sha1hex = res.terminal_sha1hex
    WHERE
        req.base_url LIKE 'https://andrzejklimczuk.com/%'
        AND req.ingest_type = 'pdf'
        AND req.link_source = 'spn';
    => 589

    -- SPN PDF ingest requests for the andrzejklimczuk.com URL prefix,
    -- broken down by the status of the matching ingest result.
    SELECT
        res.status,
        COUNT(*)
    FROM ingest_request AS req
    LEFT JOIN ingest_file_result AS res
        ON res.ingest_type = req.ingest_type
        AND res.base_url = req.base_url
    LEFT JOIN grobid
        ON grobid.sha1hex = res.terminal_sha1hex
    WHERE
        req.base_url LIKE 'https://andrzejklimczuk.com/%'
        AND req.ingest_type = 'pdf'
        AND req.link_source = 'spn'
    GROUP BY res.status;

         status     | count 
    ----------------+-------
     cdx-error      |     1
     success        |   587
     wrong-mimetype |     1
    (3 rows)


    -- SPN PDF ingest requests for the andrzejklimczuk.com URL prefix,
    -- broken down by GROBID status code. Because grobid is LEFT JOINed,
    -- requests with no matching GROBID row land in the NULL group.
    SELECT
        grobid.status_code,
        COUNT(*)
    FROM ingest_request AS req
    LEFT JOIN ingest_file_result AS res
        ON res.ingest_type = req.ingest_type
        AND res.base_url = req.base_url
    LEFT JOIN grobid
        ON grobid.sha1hex = res.terminal_sha1hex
    WHERE
        req.base_url LIKE 'https://andrzejklimczuk.com/%'
        AND req.ingest_type = 'pdf'
        AND req.link_source = 'spn'
    GROUP BY grobid.status_code;

     status_code | count 
    -------------+-------
             200 |   385
             500 |   202
                 |     2
    (3 rows)


    -- Export, one JSON object per line, the SPN PDF requests whose ingest
    -- succeeded but whose GROBID run returned status code 500.
    -- INNER JOINs are used because the WHERE clause filters on columns of
    -- both joined tables; those filters reject the NULL-extended rows an
    -- outer join would add, so only matched rows can ever be returned.
    COPY (
        SELECT row_to_json(ingest_request.*)
        FROM ingest_request
        INNER JOIN ingest_file_result ON
            ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        INNER JOIN grobid ON
            grobid.sha1hex = ingest_file_result.terminal_sha1hex
        WHERE
            ingest_request.link_source = 'spn'
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_request.base_url LIKE 'https://andrzejklimczuk.com/%'
            AND ingest_file_result.status = 'success'
            AND grobid.status_code = 500
    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
    => COPY 202

    -- Export, one JSON object per line, the SPN PDF requests whose ingest
    -- succeeded and whose GROBID run returned status code 200.
    -- INNER JOINs are used because the WHERE clause filters on columns of
    -- both joined tables; those filters reject the NULL-extended rows an
    -- outer join would add, so only matched rows can ever be returned.
    COPY (
        SELECT row_to_json(ingest_request.*)
        FROM ingest_request
        INNER JOIN ingest_file_result ON
            ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        INNER JOIN grobid ON
            grobid.sha1hex = ingest_file_result.terminal_sha1hex
        WHERE
            ingest_request.link_source = 'spn'
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_request.base_url LIKE 'https://andrzejklimczuk.com/%'
            AND ingest_file_result.status = 'success'
            AND grobid.status_code = 200
    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
    => COPY 385

```sh
# Convert the exported "good" rows (GROBID 200) into ingest request JSON.
sudo -u sandcrawler pipenv run \
    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
    > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json

# Convert the exported "bad" rows (GROBID 500) the same way, and use jq to
# add force_recrawl: true to every request object (-c keeps one per line).
sudo -u sandcrawler pipenv run \
    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
    | jq '. + {force_recrawl: true}' -c \
    > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json

# Shuffle the bad requests and produce them to the priority ingest topic.
# The head limit (60000) exceeds the 202 exported rows, so all are sent.
cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
    | shuf \
    | head -n60000 \
    | jq . -c \
    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1

# Produce a random sample of 100 good requests first
# (presumably a trial batch before the full set -- TODO confirm).
cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
    | shuf \
    | head -n100 \
    | jq . -c \
    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1

# Produce the good requests again with a limit of 10000, which exceeds the
# 385 exported rows, so the whole set is sent.
cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
    | shuf \
    | head -n10000 \
    | jq . -c \
    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1

# Convert the bad rows once more, this time without adding force_recrawl,
# into a separate "bad2" file.
sudo -u sandcrawler pipenv run \
    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
    > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json

# Shuffle the "bad2" requests and produce them to the priority ingest topic.
cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
    | shuf \
    | head -n60000 \
    | jq . -c \
    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
```