diff options
| -rw-r--r-- | notes/ingest/2020-04_unpaywall.md | 44 | ||||
| -rw-r--r-- | notes/ingest/2020-05_oai_pmh.md | 38 | 
2 files changed, 76 insertions, 6 deletions
| diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md index bce757b..c900970 100644 --- a/notes/ingest/2020-04_unpaywall.md +++ b/notes/ingest/2020-04_unpaywall.md @@ -52,6 +52,8 @@ Second time:      ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';      => 3696189 +    WARNING: forgot to transform from rows to ingest requests. +      cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1  Second time: @@ -70,6 +72,8 @@ Second time:      ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json';      => 1799760 +    WARNING: forgot to transform from rows to ingest requests. +      cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1  ## Dump no-capture, Run Crawl @@ -113,17 +117,49 @@ or may not bother trying to ingest (due to expectation of failure).      ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';      => 2602408 +NOTE: forgot here to transform from "rows" to ingest requests. +  Not actually a very significant size difference after all.  See `journal-crawls` repo for details on seedlist generation and crawling.  ## Re-Ingest Post-Crawl -Test small batch: +NOTE: if we *do* want to do cleanup eventually, could look for fatcat edits +between 2020-04-01 and 2020-05-25 which have limited "extra" metadata (eg, no +evidence or `oa_status`). 
+ +The earlier bulk ingests were done wrong (forgot to transform from rows to full +ingest request docs), so going to re-do those, which should be a superset of +the nocapture crawl URLs.: + +    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-04-08.json +    => 1.26M 0:00:58 [21.5k/s] +    => previously: 3,696,189 + +    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-05-03.json +    => 1.26M 0:00:56 [22.3k/s] + +Crap, looks like the 2020-04-08 segment got overwritten with 2020-05 data by +accident. Hrm... need to re-ingest *all* recent unpaywall URLs: + +    COPY ( +        SELECT row_to_json(ingest_request.*) +        FROM ingest_request +        WHERE +            ingest_request.ingest_type = 'pdf' +            AND ingest_request.link_source = 'unpaywall' +            AND date(ingest_request.created) > '2020-04-01' +    ) TO '/grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json'; +    => COPY 5691106 -    zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json +    => 5.69M 0:04:26 [21.3k/s] +    +Start small: -Run the whole batch: +    cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 -    zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +Looks good (whew), run the full thing: +    cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md index 4cfd8d5..37e7dfc 100644 --- a/notes/ingest/2020-05_oai_pmh.md +++ b/notes/ingest/2020-05_oai_pmh.md @@ -118,8 +118,42 @@ Dump ingest requests:              AND ingest_request.link_source = 'oai'              AND date(ingest_request.created) > '2020-05-01'              AND ingest_file_result.status IS NULL -    ) TO '/grande/snapshots/oai_noingest_20200506.requests.json'; +    ) TO '/grande/snapshots/oai_noingest_20200506.rows.json';      => COPY 49491452 -    cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +    WARNING: should have transformed from rows to requests here + +    cat /grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Crawl and re-ingest + +Updated stats after ingest (NOTE: ingest requests not really formed correctly, +but doesn't matter because fatcat wasn't importing these anyways): + +    SELECT ingest_file_result.status, COUNT(*) +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    WHERE  +        ingest_request.ingest_type = 'pdf' +        AND ingest_request.link_source = 'oai' +    GROUP BY status +    ORDER BY COUNT DESC +    LIMIT 20; + +Dump again for crawling: + +    COPY ( +        SELECT row_to_json(ingest_request.*) +        FROM ingest_request +        LEFT JOIN ingest_file_result +            ON ingest_file_result.ingest_type = ingest_request.ingest_type +            AND ingest_file_result.base_url = ingest_request.base_url +        WHERE +            ingest_request.ingest_type = 'pdf' +            AND ingest_request.link_source = 'oai' +            AND date(ingest_request.created) > '2020-05-01' +            AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error') +    ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json'; | 
