diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-12-19 17:28:27 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-12-19 17:28:27 -0800 |
commit | 15230fd7c76a8b3a7264a8b9d86c7633f1471e6a (patch) | |
tree | e998e50d4c00ae80669daf2572a8c72be4e4235f | |
parent | e8ba7a0bc8d4924f6601b4c82ead58e9f69d8aca (diff) | |
download | sandcrawler-15230fd7c76a8b3a7264a8b9d86c7633f1471e6a.tar.gz sandcrawler-15230fd7c76a8b3a7264a8b9d86c7633f1471e6a.zip |
notes on file-level metadata dump
-rw-r--r-- | notes/petabox_ia_metadata.txt | 31 |
1 files changed, 31 insertions, 0 deletions
diff --git a/notes/petabox_ia_metadata.txt b/notes/petabox_ia_metadata.txt new file mode 100644 index 0000000..df3bdc1 --- /dev/null +++ b/notes/petabox_ia_metadata.txt @@ -0,0 +1,31 @@ + +Ran in aitio:/schnell/iamine-journals in December 2018. + +Output uploaded to https://archive.org/details/ia-petabox-journal-metadata-2018 + +Commands: + + # didn't work! + #ia-mine --search collection:journals --itemlist > journals.20181218.itemlist + + # fetched manually via metamgr, using prefix matches + cat metamgr-* > metamgr-journals-loose.20181218.items + + ia-mine metamgr-journals-loose.20181218.items > journals-ia.20181218.json + + export LC_ALL=C + cat journals-ia.20181218.json | jq 'select(.files) | .files[] | select(.format == "Text PDF") | .sha1' -r | sort -u > journals-ia.20181218.pdf-sha1.tsv + +Size/results: + + bnewbold@ia601101$ wc -l journals-ia.20181218.json metamgr-journals-loose.20181218.items + 2043877 journals-ia.20181218.json + 2044362 metamgr-journals-loose.20181218.items + + # missed about 500; meh + + -rw-rw-r-- 1 bnewbold bnewbold 9.5G Dec 19 23:26 journals-ia.20181218.json + + bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv + 1748645 journals-ia.20181218.pdf-sha1.tsv + |