This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for
heritrix bulk crawling, along with JALC and DOAJ URLs.

    export SNAPSHOT=2022-07-20

## Transform and Load

    # on sandcrawler-vm
    mkdir -p /srv/sandcrawler/tasks/doaj
    cd /srv/sandcrawler/tasks/doaj
    wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"

    # in pipenv, in python directory
    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
    # 9.72M 0:36:28 [4.44k/s]

    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
    # 9.72M 0:17:04 [9.49k/s]
    # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
    # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})
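The transform step above fans each DOAJ article record out into zero or more
ingest requests, keyed on the `fulltext` links in the record. The authoritative
mapping is whatever `./scripts/doaj2ingestrequest.py` actually implements; the
sketch below is only an illustration, and the content-type mapping and request
field names here are assumptions:

    import json
    import sys

    # Illustrative content_type -> ingest_type mapping (assumed, not the real one
    # from scripts/doaj2ingestrequest.py)
    CONTENT_TYPE_MAP = {
        "pdf": "pdf",
        "application/pdf": "pdf",
        "html": "html",
        "text/html": "html",
        "xml": "xml",
    }

    def doaj_to_requests(record):
        """Convert one DOAJ article record into zero or more ingest requests."""
        requests = []
        bibjson = record.get("bibjson", {})
        for link in bibjson.get("link", []):
            if link.get("type") != "fulltext" or not link.get("url"):
                continue
            ingest_type = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
            if not ingest_type:
                continue
            requests.append({
                "ingest_type": ingest_type,
                "base_url": link["url"],
                "link_source": "doaj",
                "link_source_id": record.get("id"),
            })
        return requests

    if __name__ == "__main__":
        for line in sys.stdin:
            if line.strip():
                for req in doaj_to_requests(json.loads(line)):
                    print(json.dumps(req, sort_keys=True))

The `persist_tool.py ingest-request` step then persists these rows into the
`ingest_request` table; only ~810k of the 9.72M lines were new inserts,
presumably because most requests were already present from earlier DOAJ loads.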
Stats after this load:

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.link_source = 'doaj'
    GROUP BY ingest_request.ingest_type, status
    -- next time include ingest_type in sort
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |          status          |  count
    -------------+--------------------------+---------
     pdf         | success                  | 3165539
     pdf         |                          | 2078874
     html        |                          | 1547698
     html        | wrong-scope              | 1114332
     pdf         | no-pdf-link              |  517261
     html        | success                  |  388376
     html        | unknown-scope            |  242044
     pdf         | no-capture               |  179030
     pdf         | terminal-bad-status      |  174741
     html        | no-capture               |  155323
     pdf         | null-body                |  129267
     pdf         | redirect-loop            |  127136
     html        | html-resource-no-capture |  117275
     html        | null-body                |  100296
     pdf         | blocked-cookie           |   71093
     html        | redirect-loop            |   65519
     html        | terminal-bad-status      |   64856
     html        | blocked-cookie           |   64095
     html        | spn2-backoff             |   55173
     pdf         | link-loop                |   27440
     html        | wrong-mimetype           |   26016
     html        | wayback-content-error    |   20109
     xml         |                          |   13624
     pdf         | wrong-mimetype           |    8411
     xml         | success                  |    6899
     html        | petabox-error            |    6199
     html        | wayback-error            |    5269
     html        | spn2-cdx-lookup-failure  |    4635
     html        | spn2-recent-capture      |    4527
     xml         | null-body                |    2353
    (30 rows)

## Bulk Ingest

    COPY (
        SELECT row_to_json(t1.*)
        FROM (
            SELECT ingest_request.*, ingest_file_result as result
            FROM ingest_request
            LEFT JOIN ingest_file_result
                ON ingest_file_result.base_url = ingest_request.base_url
                AND ingest_file_result.ingest_type = ingest_request.ingest_type
            WHERE
                ingest_request.link_source = 'doaj'
                -- AND (ingest_request.ingest_type = 'pdf'
                --     OR ingest_request.ingest_type = 'xml')
                AND (
                    ingest_file_result.status IS NULL
                    OR ingest_file_result.status = 'no-capture'
                )
                AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
                AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
                AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
                AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
                AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
                AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
                AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
                AND ingest_request.base_url NOT LIKE '%://archive.org/%'
                AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
                AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
        ) t1
    ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
    # COPY 3962331

Transform:

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
    # 3.96M 0:01:47 [36.7k/s]

Top domains:

    cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20

     789988 www.mdpi.com
     318142 www.frontiersin.org
     226316 link.springer.com
     204429 www.scielo.br
     201175 www.sciencedirect.com
      72852 ieeexplore.ieee.org
      68983 dx.doi.org
      33286 www.dovepress.com
      26020 elifesciences.org
      23838 www.cetjournal.it
      21102 mab-online.nl
      20242 www.revistas.usp.br
      16564 periodicos.uem.br
      15710 journals.openedition.org
      14514 dergipark.org.tr
      14072 apcz.umk.pl
      13924 ojs.minions.amsterdam
      13717 bmgn-lchr.nl
      13512 ojstest.minions.amsterdam
      10440 journals.asm.org

Bulk ingest:

    cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # Done

## Stats Again

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.link_source = 'doaj'
    GROUP BY ingest_request.ingest_type, status
    -- ORDER BY ingest_request.ingest_type, COUNT DESC
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |          status          |  count
    -------------+--------------------------+---------
     pdf         | success                  | 4704006
     html        | wrong-scope              | 1761227
     html        | success                  |  778165
     pdf         | no-pdf-link              |  759805
     html        | no-capture               |  382080
     html        | unknown-scope            |  313391
     html        | html-resource-no-capture |  292953
     pdf         | no-capture               |  290311
     pdf         | terminal-bad-status      |  271776
     pdf         | null-body                |  129267
     pdf         | blocked-cookie           |  108491
     html        | terminal-bad-status      |  103014
     html        | null-body                |  100296
     html        | blocked-cookie           |   88533
     pdf         |                          |   81517
     pdf         | skip-url-blocklist       |   76443
     html        | spn2-backoff             |   50615
     pdf         | link-loop                |   45516
     html        | wrong-mimetype           |   33525
     html        | wayback-content-error    |   25535
     pdf         | empty-blob               |   21431
     pdf         | redirect-loop            |   19795
     html        | petabox-error            |   18291
     html        | empty-blob               |   14391
     pdf         | wrong-mimetype           |   14084
     html        | redirect-loop            |   12856
     xml         | success                  |   10381
     xml         | no-capture               |   10008
     html        | skip-url-blocklist       |    3294
     html        | cdx-error                |    3275
    (30 rows)

Pretty good success rate for PDFs. That is a lot of `no-capture` though! And
why are there 81k PDF requests with no attempt at all? Maybe a filter, or
bogus URLs.

Over 1.5 million new PDF successes over this crawl iteration, nice.
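One way to chase down those 81k `pdf` rows with no attempt would be to dump
just the `status IS NULL` requests (same `COPY ... row_to_json` pattern as the
seedlist query above) and scan their `base_url` values for obviously malformed
entries. A minimal ad-hoc sketch, assuming a one-JSON-object-per-line dump on
stdin; this is not part of the sandcrawler tooling:

    import json
    import sys
    from collections import Counter
    from urllib.parse import urlparse

    # Tally obviously-bogus base_url values: missing or odd scheme, or no hostname.
    counts = Counter()
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        url = json.loads(line).get("base_url") or ""
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            counts["bad-scheme"] += 1
        elif not parsed.netloc:
            counts["no-host"] += 1
        else:
            counts["ok"] += 1

    print(counts.most_common())

If nearly everything comes back `ok`, bogus URLs are not the explanation, and
the gap more likely comes from the domain exclusions in the seedlist query.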