diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-06-20 15:38:11 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-06-20 15:38:11 -0700 |
commit | e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2 (patch) | |
tree | 26ee0883820850976c321aa11d825fe5bc5622b8 /notes | |
parent | 87603ba93b343c0c2b55d52c5f99697c06a672b4 (diff) | |
download | sandcrawler-e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2.tar.gz sandcrawler-e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2.zip |
petabox journal files ingest updates
Diffstat (limited to 'notes')
-rw-r--r-- | notes/petabox_ia_metadata.txt | 25 |
1 file changed, 25 insertions, 0 deletions
diff --git a/notes/petabox_ia_metadata.txt b/notes/petabox_ia_metadata.txt
index 3a99805..f46ea61 100644
--- a/notes/petabox_ia_metadata.txt
+++ b/notes/petabox_ia_metadata.txt
@@ -29,3 +29,28 @@ Size/results:
 bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv
 1748645 journals-ia.20181218.pdf-sha1.tsv
+## June 2019 Ingest
+
+ bnewbold@ia601101$ pwd
+ /schnell/iamine-journals
+
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "arxiv-' > arxiv.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "jstor-' > jstor.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "paper-doi-10_' > paper-doi.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "pubmed-PMC' > pmc.json
+
+ cat arxiv.json | ./ia_pdf_match.py > arxiv.match.json
+ cat jstor.json | ./ia_pdf_match.py > jstor.match.json
+ cat paper-doi.json | ./ia_pdf_match.py > paper-doi.match.json
+ cat pmc.json | ./ia_pdf_match.py > pmc.match.json
+
+ bnewbold@ia601101$ wc -l arxiv.*json jstor.*json paper-doi.*json pmc.*json
+ 1076012 arxiv.json
+ 740970 arxiv.match.json
+ 451204 jstor.json
+ 451204 jstor.match.json
+ 77838 paper-doi.json
+ 23736 paper-doi.match.json
+ 209787 pmc.json
+ 189093 pmc.match.json
+