aboutsummaryrefslogtreecommitdiffstats
path: root/notes/petabox_ia_metadata.txt
blob: f46ea61563d8b1f4a39934a2503fbbda65d61816 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

Ran in aitio:/schnell/iamine-journals in December 2018.

Output uploaded to https://archive.org/details/ia-petabox-journal-metadata-2018

Commands:

    # first attempt: building the item list via a collection search; this didn't work!
    #ia-mine --search collection:journals --itemlist > journals.20181218.itemlist   

    # fetched manually via metamgr, using prefix matches
    cat metamgr-* > metamgr-journals-loose.20181218.items

    ia-mine metamgr-journals-loose.20181218.items > journals-ia.20181218.json

    export LC_ALL=C
    cat journals-ia.20181218.json | jq 'select(.files) | .files[] | select(.format == "Text PDF") | .sha1' -r | sort -S 4G -u > journals-ia.20181218.pdf-sha1.tsv

Size/results:

    bnewbold@ia601101$ wc -l journals-ia.20181218.json metamgr-journals-loose.20181218.items
    2043877 journals-ia.20181218.json
    2044362 metamgr-journals-loose.20181218.items

    # ia-mine output is missing about 500 items (2044362 - 2043877 = 485); close enough

    -rw-rw-r--  1 bnewbold bnewbold 9.5G Dec 19 23:26 journals-ia.20181218.json

    bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv 
    1748645 journals-ia.20181218.pdf-sha1.tsv

## June 2019 Ingest

    bnewbold@ia601101$ pwd
    /schnell/iamine-journals

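    # split the December 2018 dump into per-source files by identifier prefix
    # (NOTE: the dump appears to have been gzipped since the original run)
    zcat journals-ia.20181218.json.gz | rg '"identifier": "arxiv-' > arxiv.json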
    zcat journals-ia.20181218.json.gz | rg '"identifier": "jstor-' > jstor.json
    zcat journals-ia.20181218.json.gz | rg '"identifier": "paper-doi-10_' > paper-doi.json
    zcat journals-ia.20181218.json.gz | rg '"identifier": "pubmed-PMC' > pmc.json

    cat arxiv.json | ./ia_pdf_match.py > arxiv.match.json
    cat jstor.json | ./ia_pdf_match.py > jstor.match.json
    cat paper-doi.json | ./ia_pdf_match.py > paper-doi.match.json
    cat pmc.json | ./ia_pdf_match.py > pmc.match.json

    bnewbold@ia601101$ wc -l arxiv.*json jstor.*json paper-doi.*json pmc.*json 
        1076012 arxiv.json
         740970 arxiv.match.json
         451204 jstor.json
         451204 jstor.match.json
          77838 paper-doi.json
          23736 paper-doi.match.json
         209787 pmc.json
         189093 pmc.match.json