aboutsummaryrefslogtreecommitdiffstats
path: root/notes/petabox_ia_metadata.txt
diff options
context:
space:
mode:
Diffstat (limited to 'notes/petabox_ia_metadata.txt')
-rw-r--r--notes/petabox_ia_metadata.txt56
1 files changed, 56 insertions, 0 deletions
diff --git a/notes/petabox_ia_metadata.txt b/notes/petabox_ia_metadata.txt
new file mode 100644
index 0000000..f46ea61
--- /dev/null
+++ b/notes/petabox_ia_metadata.txt
@@ -0,0 +1,56 @@
+
+Ran in aitio:/schnell/iamine-journals in December 2018.
+
+Output uploaded to https://archive.org/details/ia-petabox-journal-metadata-2018
+
+Commands:
+
+ # didn't work!
+ #ia-mine --search collection:journals --itemlist > journals.20181218.itemlist
+
+ # fetched manually via metamgr, using prefix matches
+ cat metamgr-* > metamgr-journals-loose.20181218.items
+
+ ia-mine metamgr-journals-loose.20181218.items > journals.20181218.json
+
+ export LC_ALL=C
+ cat journals-ia.20181218.json | jq 'select(.files) | .files[] | select(.format == "Text PDF") | .sha1' -r | sort -S 4G -u > journals-ia.20181218.pdf-sha1.tsv
+
+Size/results:
+
+ bnewbold@ia601101$ wc -l journals-ia.20181218.json metamgr-journals-loose.20181218.items
+ 2043877 journals-ia.20181218.json
+ 2044362 metamgr-journals-loose.20181218.items
+
+ # missed about 500; meh
+
+ -rw-rw-r-- 1 bnewbold bnewbold 9.5G Dec 19 23:26 journals-ia.20181218.json
+
+ bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv
+ 1748645 journals-ia.20181218.pdf-sha1.tsv
+
+## June 2019 Ingest
+
+ bnewbold@ia601101$ pwd
+ /schnell/iamine-journals
+
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "arxiv-' > arxiv.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "jstor-' > jstor.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "paper-doi-10_' > paper-doi.json
+ zcat journals-ia.20181218.json.gz | rg '"identifier": "pubmed-PMC' > pmc.json
+
+ cat arxiv.json | ./ia_pdf_match.py > arxiv.match.json
+ cat jstor.json | ./ia_pdf_match.py > jstor.match.json
+ cat paper-doi.json | ./ia_pdf_match.py > paper-doi.match.json
+ cat pmc.json | ./ia_pdf_match.py > pmc.match.json
+
+ bnewbold@ia601101$ wc -l arxiv.*json jstor.*json paper-doi.*json pmc.*json
+ 1076012 arxiv.json
+ 740970 arxiv.match.json
+ 451204 jstor.json
+ 451204 jstor.match.json
+ 77838 paper-doi.json
+ 23736 paper-doi.match.json
+ 209787 pmc.json
+ 189093 pmc.match.json
+