Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.

    export PATCHDATE=2022-07-29
    export CRAWLVM=wbgrp-svc279.us.archive.org
    export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07

## Seedlist Query

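Dump ingest requests (`pdf` and `html`) whose ingest result has a capture- or fetch-failure status, together with their terminal URLs, as JSON rows: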

    COPY (
        SELECT row_to_json(t) FROM (
            SELECT ingest_file_result.terminal_url, ingest_request.*
            FROM ingest_request
            LEFT JOIN ingest_file_result
                ON ingest_file_result.ingest_type = ingest_request.ingest_type
                AND ingest_file_result.base_url = ingest_request.base_url
            WHERE
                (
                    ingest_request.ingest_type = 'pdf'
                    OR ingest_request.ingest_type = 'html'
                )
                -- AND ingest_file_result.updated >= '2022-01-12'
                AND (
                    ingest_file_result.status = 'no-capture'
                    OR ingest_file_result.status = 'cdx-error'
                    OR ingest_file_result.status = 'wayback-error'
                    OR ingest_file_result.status = 'wayback-content-error'
                    OR ingest_file_result.status = 'petabox-error'
                    OR ingest_file_result.status LIKE 'spn2-%'
                    OR ingest_file_result.status = 'gateway-timeout'
                    OR (
                        ingest_file_result.status = 'terminal-bad-status'
                        AND (
                            ingest_file_result.terminal_status_code = 500
                            OR ingest_file_result.terminal_status_code = 502
                            OR ingest_file_result.terminal_status_code = 503
                            OR ingest_file_result.terminal_status_code = 429
                        )
                    )
                )
                AND (
                    ingest_request.link_source = 'doi'
                    OR ingest_request.link_source = 'doaj'
                    OR ingest_request.link_source = 'dblp'
                    OR ingest_request.link_source = 'arxiv'
                    OR ingest_request.link_source = 'pmc'
                    -- OR ingest_request.link_source = 'unpaywall'
                    -- OR ingest_request.link_source = 'oai'
                )

                AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
                AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
                AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'

                AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
                AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
                AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
                AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'

                -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
        ) t
    ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
    => COPY 3524573

    cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
        | rg -v "\\\\" \
        | jq -r .terminal_url \
        | rg '://' \
        | rg -i '^http' \
        | rg -v '://10\.' \
        | rg -v '://172\.' \
        | sort -u -S 4G \
        | pv -l \
        > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
    => 3.11M 0:01:08 [45.4k/s]

    # check top domains
    cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
     624948 doi.org
     382492 www.jstage.jst.go.jp
     275087 www.mdpi.com
     157134 www.persee.fr
     108979 www.sciencedirect.com
      94375 www.scielo.br
      50834 onlinelibrary.wiley.com
      49991 journals.lww.com
      30354 www.frontiersin.org
      27963 doaj.org
      27058 www.e-periodica.ch
      24147 dl.acm.org
      23389 aclanthology.org
      22086 www.research-collection.ethz.ch
      21589 medien.die-bonn.de
      18866 www.ingentaconnect.com
      18583 doi.nrct.go.th
      18271 repositories.lib.utexas.edu
      17634 hdl.handle.net
      16366 archives.datapages.com
      15146 cgscholar.com
      13987 dl.gi.de
      13188 www.degruyter.com
      12503 ethos.bl.uk
      12304 preprints.jmir.org

    cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
    => done

    scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
    ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/


## Re-Ingest

Transform the dumped rows into ingest requests (shuffled):

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json
    => 3.52M 0:01:37 [36.2k/s]

Push the requests to the bulk ingest Kafka topic:

    cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1