diff options
Diffstat (limited to 'notes')
| -rw-r--r-- | notes/ingest/2021-09-02_oai_pmh_patch.md | 4 | ||||
| -rw-r--r-- | notes/ingest/2022-03_oaipmh.md | 40 | ||||
| -rw-r--r-- | notes/ingest/2022-09_oaipmh.md | 349 | 
3 files changed, 391 insertions, 2 deletions
| diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md index fded7b3..ac808dd 100644 --- a/notes/ingest/2021-09-02_oai_pmh_patch.md +++ b/notes/ingest/2021-09-02_oai_pmh_patch.md @@ -1506,8 +1506,8 @@ possible to detect these at ingest time, or earlier at OAI-PMH  harvest/transform time and filter them out.  It may be worthwhile to attempt ingest of multiple existing captures -(timestamps) in the ingest pipeline.  Eg, isntead of chosing a single "best" -capture, if therea are multiple HTTP 200 status captures, try ingest with each +(timestamps) in the ingest pipeline.  Eg, instead of chosing a single "best" +capture, if there are multiple HTTP 200 status captures, try ingest with each  (or at least a couple).  This is because repository software gets upgraded, so  old "no-capture" or "not found" or "link loop" type captures may work when  recrawled. diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md new file mode 100644 index 0000000..d2a8d71 --- /dev/null +++ b/notes/ingest/2022-03_oaipmh.md @@ -0,0 +1,40 @@ + +Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl. + +Note that Martin excluded many Indonesian endpoints, will need to follow-up on +those. + +## Prep + +Fetch metadata snapshot: + +    wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst + +    wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst + +Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large): + +    zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \ +        | rg -v 'oai:kb.dk:' \ +        | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \ +        | rg -v 'oai:hispana.mcu.es:' \ +        | rg -v 'oai:bnf.fr:' \ +        | rg -v 'oai:ukm.si:' \ +        | rg -v 'oai:biodiversitylibrary.org:' \ +        | rg -v 'oai:hsp.org:' \ +        | rg -v 'oai:repec:' \ +        | rg -v 'oai:n/a:' \ +        | rg -v 'oai:quod.lib.umich.edu:' \ +        | rg -v 'oai:americanae.aecid.es:' \ +        | rg -v 'oai:www.irgrid.ac.cn:' \ +        | rg -v 'oai:espace.library.uq.edu:' \ +        | rg -v 'oai:edoc.mpg.de:' \ +        | rg -v 'oai:bibliotecadigital.jcyl.es:' \ +        | rg -v 'oai:repository.erciyes.edu.tr:' \ +        | rg -v 'oai:krm.or.kr:' \ +        | ./scripts/oai2ingestrequest.py - \ +        | pv -l \ +        | gzip \ +        > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz + +These failed to transform in the expected way; a change in JSON schema from last time? diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md new file mode 100644 index 0000000..0aa4487 --- /dev/null +++ b/notes/ingest/2022-09_oaipmh.md @@ -0,0 +1,349 @@ + +Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921> + +I updated the transform script to block some additional domains. + + +## Prep + +Fetch the snapshot: + +    cd /srv/sandcrawler/tasks/ +    wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst + +Transform to ingest requests: + +    cd /srv/sandcrawler/src/python +    git log | head -n1 +    # commit dfd4605d84712eccb95a63e50b0bcb343642b433 + +    pipenv shell +    zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \ +        | ./scripts/oai2ingestrequest.py - \ +        | pv -l \ +        | gzip \ +        > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz +    # 16.1M 1:01:02 [4.38k/s] + +Curious about types, though this would probably be handled at fatcat ingest +time: + +    zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt + +    head oai_type_counts.txt -n30 +    5623867 info:eu-repo/semantics/article +    5334928 info:eu-repo/semantics/publishedVersion +    3870359 text +    1240225 Text +     829169 Article +     769849 NonPeerReviewed +     665700 PeerReviewed +     648740 Peer-reviewed Article +     547857 article +     482906 info:eu-repo/semantics/bachelorThesis +     353814 Thesis +     329269 Student thesis +     262650 info:eu-repo/semantics/conferenceObject +     185354 Journal articles +     162021 info:eu-repo/semantics/doctoralThesis +     152079 Journal Article +     150226 Research Article +     130217 Conference papers +     127255 ArtÃculo revisado por pares +     124243 Newspaper +     123908 ##rt.metadata.pkp.peerReviewed## +     123309 Photograph +     122981 info:eu-repo/semantics/masterThesis +     116719 Book +     108946 Image +     108216 Report +     107946 Other +     103562 masterThesis +     103038 info:eu-repo/semantics/other +     101404 StillImage +    [...] + +And formats: + +    zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt + +    head -n 20 oai_format_counts.txt  +    11151928 application/pdf +     677413 text +     561656 text/html +     498518 image/jpeg +     231219 Text +     193638 text/xml +     147214 Image +     117073 image/jpg +     110872 pdf +      91323 image/tiff +      76948 bib +      75393 application/xml +      70244 Digitized from 35 mm. microfilm. +      68206 mods +      59227 PDF +      57677 application/epub+zip +      57602 application/octet-stream +      52072 text/plain +      51620 application/msword +      47227 audio/mpeg + +Also, just overall size (number of records): + +    zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l +    # 20,840,301 + +Next load in to sandcrawler DB: + +    zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request - + +    Traceback (most recent call last): +      File "./persist_tool.py", line 311, in <module> +        main() +      File "./persist_tool.py", line 307, in main +        args.func(args) +      File "./persist_tool.py", line 119, in run_ingest_request +        pusher.run() +      File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run +        self.worker.push_batch(batch) +      File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch +        resp = self.db.insert_ingest_request(self.cur, irequests) +      File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request +        resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) +      File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values +        cur.execute(b''.join(parts)) +        psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx" +        DETAIL:  Index row references tuple (6893121,3) in relation "ingest_request". +        HINT:  Values larger than 1/3 of a buffer page cannot be indexed. +        Consider a function index of an MD5 hash of the value, or use full text indexing. +    15.7M 0:41:48 [6.27k/s] + +Darn, this means we won't get reasonable stats about how many rows were +inserted/updated. + +Patched the persist tool to skip very long URLs, and ran again (backwards, just +URLs which didn't get inserted already): + +    zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \ +        | tac \ +        | head -n1000000 \ +        | pv -l \ +        | ./persist_tool.py ingest-request - +    # 1.00M 0:03:04 [5.41k/s] +    # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0}) + +Status of just the new lines: + +    SELECT ingest_file_result.status, COUNT(*) +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    WHERE  +        ingest_request.ingest_type = 'pdf' +        AND ingest_request.link_source = 'oai' +        AND date(ingest_request.created) > '2022-09-01' +    GROUP BY status +    ORDER BY COUNT DESC +    LIMIT 20; + + +             status          |  count +    -------------------------+--------- +                             | 6398455 +     success                 |  540219 +     no-pdf-link             |   41316 +     link-loop               |   23871 +     no-capture              |   11350 +     redirect-loop           |    8315 +     wrong-mimetype          |    2394 +     terminal-bad-status     |    1540 +     null-body               |    1038 +     cdx-error               |     272 +     empty-blob              |     237 +     petabox-error           |     213 +     wayback-error           |     186 +     blocked-cookie          |     107 +     timeout                 |      47 +     wayback-content-error   |      26 +     spn2-cdx-lookup-failure |      21 +     skip-url-blocklist      |      16 +     spn2-backoff            |      15 +     body-too-large          |      13 +    (20 rows) + + +## Bulk Ingest + +Should already have filtered domains/prefixes in transform script, so not +including filters here. + +    COPY ( +        SELECT row_to_json(ingest_request.*) +        FROM ingest_request +        LEFT JOIN ingest_file_result +            ON ingest_file_result.ingest_type = ingest_request.ingest_type +            AND ingest_file_result.base_url = ingest_request.base_url +        WHERE +            ingest_request.ingest_type = 'pdf' +            AND ingest_request.link_source = 'oai' +            AND date(ingest_request.created) > '2022-09-01' +            AND ingest_file_result.status IS NULL +    ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json'; +    # COPY 6398455 + +    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \ +        | pv -l \ +        | shuf \ +        > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json +    # 6.40M 0:02:18 [46.2k/s] + +    cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \ +        | rg -v "\\\\" \ +        | jq . -c \ +        | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +    # DONE + +Expect this ingest to take a week or so. + +Then, run stats again: + +    SELECT ingest_file_result.status, COUNT(*) +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    WHERE  +        ingest_request.ingest_type = 'pdf' +        AND ingest_request.link_source = 'oai' +        AND date(ingest_request.created) > '2022-09-01' +    GROUP BY status +    ORDER BY COUNT DESC +    LIMIT 20; + +             status          |  count   +    -------------------------+--------- +     no-capture              | 3617175 +     success                 | 2775036 +     no-pdf-link             |  449298 +     link-loop               |   74260 +     terminal-bad-status     |   47819 +     wrong-mimetype          |   20195 +     redirect-loop           |   18197 +     empty-blob              |   12127 +     cdx-error               |    3038 +     skip-url-blocklist      |    2630 +     wayback-error           |    2599 +     petabox-error           |    2354 +     wayback-content-error   |    1617 +     blocked-cookie          |    1293 +     null-body               |    1038 +     body-too-large          |     670 +                             |     143 +     bad-gzip-encoding       |      64 +     timeout                 |      47 +     spn2-cdx-lookup-failure |      20 +    (20 rows) + + +## Crawl Seedlist + +    COPY ( +        SELECT row_to_json(ingest_request.*) +        FROM ingest_request +        LEFT JOIN ingest_file_result +            ON ingest_file_result.ingest_type = ingest_request.ingest_type +            AND ingest_file_result.base_url = ingest_request.base_url +        WHERE +            ingest_request.ingest_type = 'pdf' +            AND ingest_request.link_source = 'oai' +            AND date(ingest_request.created) > '2022-09-01' +            AND ( +                ingest_file_result.status = 'no-capture' +                OR ingest_file_result.status = 'redirect-loop' +                OR ingest_file_result.status = 'terminal-bad-status' +                OR ingest_file_result.status = 'cdx-error' +                OR ingest_file_result.status = 'petabox-error' +                OR ingest_file_result.status = 'wayback-error' +                OR ingest_file_result.status = 'timeout' +                OR ingest_file_result.status = 'wayback-content-error' +            ) +    ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json'; +    => COPY 3692846 + +    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ +        | pv -l \ +        | shuf \ +        > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json +    => 3.69M 0:01:19 [46.6k/s] + +This will be used for re-ingest later. For now, extract URLs: + +    cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ +        | jq .base_url -r \ +        | sort -u -S 4G \ +        | pv -l \ +        > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt +    => 3.66M 0:00:59 [61.8k/s] + +    cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ +        | rg '"terminal_url"' \ +        | jq -r .result.terminal_url \ +        | rg -v ^null$ \ +        | sort -u -S 4G \ +        | pv -l \ +        > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt +    => 0.00  0:00:05 [0.00 /s] + +    cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ +        | awk '{print "F+ " $1}' \ +        | shuf \ +        > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + +What domains are we crawling? + +    cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ +        | sort -u -S 4G \ +        | cut -d/ -f3 \ +        | sort \ +        | uniq -c \ +        | sort -nr \ +        > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + +    head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt +      91899 raco.cat +      70116 islandora.wrlc.org +      68708 urn.kb.se +      63726 citeseerx.ist.psu.edu +      50370 publications.rwth-aachen.de +      44885 urn.nsk.hr +      38429 server15795.contentdm.oclc.org +      33041 periodicos.ufpb.br +      32519 nbn-resolving.org +      31990 www.ajol.info +      24745 hal.archives-ouvertes.fr +      22569 id.nii.ac.jp +      17239 tilburguniversity.on.worldcat.org +      15873 dspace.nbuv.gov.ua +      15436 digitalcommons.wustl.edu +      14885 www.iiste.org +      14623 www.manchester.ac.uk +      14033 nbn-resolving.de +      13999 opus4.kobv.de +      13689 www.redalyc.org + +Sizes: + +    wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + +      3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt +            0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt +      3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + +Copy seedlist to crawler: + +    # as regular user +    scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp + | 
