From fb9d55bddc85c865b4e7eb4fb1259891f6f4a9be Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 30 Apr 2019 17:20:56 -0700 Subject: old fileset and webcapture example entities --- extra/demo_entities/filesets.txt | 73 +++++++++++++++++++++++++++++++++++++ extra/demo_entities/webcaptures.txt | 73 +++++++++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 extra/demo_entities/filesets.txt diff --git a/extra/demo_entities/filesets.txt b/extra/demo_entities/filesets.txt new file mode 100644 index 00000000..9a3beae3 --- /dev/null +++ b/extra/demo_entities/filesets.txt @@ -0,0 +1,73 @@ + +## Goals + +"DASH/CDL/IA/Dat importer" + => start with local dat clone w/ discovery key; releases that have DOI + => but may need to create release if datacite + => enumerate and hash all the files under 'data/' + => process metadata from cdl_dash_metadata.json + => construct fileset entity + => set extra['ark_id'] + => set extra['related_works'] = [] (?) + => or group under the work? + => add: rel=dweb url=dat://.../files/ + => add CDL... repo-bundle? + https://merritt.cdlib.org/u/ark%3A%2Fb5068%2Fd1rp49/2 + => add CDL... repo-dir? 
+ https://merritt.cdlib.org/d/ark%3A%2Fb5068%2Fd1rp49/2/021611_H929.txt + +## Example Works + +https://dash.ucop.edu/stash/dataset/doi:10.7280/D1J37Z +"Jakobshavn Glacier Bed Elevation" +< 1MByte +doi:10.7280/D1J37Z +ark:/13030/m5rg0r8q +dat://77e94744aa5f967e6ed7e3990bfc29f141dbf2c0fff572eb1212b3bd706882f4 +NOTE: abstract was unicode-mangled for this one; I fixed by hand +https://fatcat.wiki/fileset/ho376wmdanckpp66iwfs7g22ne + +https://dash.ucop.edu/stash/dataset/doi:10.5068/D1RP49 +"Live cell interferometry cell division tracking data files" +54 MByte, couple dozen files, no directories +doi:10.5068/D1RP49 +ark:/b5068/d1rp49 +dat://7f5f95752650ab2968ec6a0c491fe320937ab928f57bd88692b1086248ee2925 +https://fatcat.wiki/fileset/ltjp7k2nrbes3or5h4na5qgxlu + +https://dash.ucop.edu/stash/dataset/doi:10.15146/R3201J +"Data associated with Britten, Thatcher and Caro (PLOS One, 2016). "Zebras and biting flies: quantitative analysis of reflected light from zebra coats in their natural habitat."" +CC-0 +783 MByte +doi:10.15146/R3201J +ark:/13030/m53r5pzm +dat://c02c88d3989df551e203089d67b1c2a3ae36e933b229c464d78356935acedfd1 +existing fatcat work:h5cb6baxnragxlg4tamgsgpef4 release:qws4ekug5bgivkxsvsgrtwuybe +https://fatcat.wiki/fileset/vp2azlpw5zgsrjr7d3w7csej2u + +stress test: +https://dash.ucop.edu/stash/dataset/doi:10.7272/Q66Q1V54 +doi:10.7272/Q66Q1V54 +ark:/b7272/q66q1v54 +dat://f0c1cbc00720ff03c47234c737e3a62088f3ec51c5b911f5e6cc73d4571bd3c0 +16 GByte, many files, in sub-directories (for which the dat is broken) + +Unfortunately, looks like these ARKs don't resolve (get a tombstone, "Object in +restricted Merritt collection"): http://n2t.net/ark:/13030/m53r5pzm + +## Commands + +First: + + ./fatcat_import.py --host-url https://api.fatcat.wiki/v0 cdl-dash-dat \ + 77e94744aa5f967e6ed7e3990bfc29f141dbf2c0fff572eb1212b3bd706882f4 + +Then: + + ./fatcat_import.py --host-url https://api.fatcat.wiki/v0 cdl-dash-dat \ + --editgroup-id xl3rz6uxfrb2pgprzxictbkvxi \ + 
7f5f95752650ab2968ec6a0c491fe320937ab928f57bd88692b1086248ee2925 + + [etc] + + diff --git a/extra/demo_entities/webcaptures.txt b/extra/demo_entities/webcaptures.txt index 2d86fcbb..b753b689 100644 --- a/extra/demo_entities/webcaptures.txt +++ b/extra/demo_entities/webcaptures.txt @@ -43,3 +43,76 @@ And then: ./fatcat_util.py --host-url https://api.fatcat.wiki/v0 editgroup-accept kpuel5gcgjfrzkowokq54k633q + +## Links/Works + +http://worrydream.com/ClimateChange/ + +https://joi.ito.com/weblog/2018/05/28/citing-blogs.html + => https://fatcat.wiki/release/sejvdbc4mrh6ja73r5ov64l4vi + +http://kcoyle.net/mexico.html + +http://www.dlib.org/dlib/june01/reich/06reich.html + => https://fatcat.wiki/release/z477qzrwfvg2vbx226qwo2gosy + => http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html +http://www.dlib.org/dlib/november12/beaudoin/11beaudoin1.html + => https://fatcat.wiki/release/rm4afnxm2jfotbsky2ca5uqlzm +http://www.dlib.org/dlib/march08/marshall/03marshall-pt1.html + => https://fatcat.wiki/release/mjtqtuyhwfdr7j2c3l36uor7uy + +https://web.archive.org/web/20141222133249/http://www.genders.org/g58/g58_doyle.html + => https://fatcat.wiki/container/nzyvsqxghrhhppt7ruhfsvcnru (?) + => https://fatcat.wiki/container/47b5x547gvbw3pbjdpqicyne7u (?) 
+ +https://blog.dshr.org/2014/03/the-half-empty-archive.html +https://blog.dshr.org/2018/10/brief-talk-at-internet-archive-event.html + +https://distill.pub/2017/momentum/ + => https://fatcat.wiki/release/urz24xenybawtlfaflo3yxhcoa + +http://people.csail.mit.edu/junyanz/cat/cat_papers.html + +## Goals + +"static page" script that takes extid (or fatcat id) and wayback link + x=> looks up fatcat release entity + x=> checks for existing webcapture object with same params + x=> fetch wayback base HTML, in re-write mode + x=> extract list of all embeds + x=> hit CDX server for each embed, as well as base URL + x=> create webcapture entity locally + => write out CDX snippet to local disk + x=> submit to API (controlled by flag) and print editgroup + +"add warc file" script; takes CDX snippet and webcapture id + => CDX-to-WARC locally + => push to a petabox item + => update webcapture entity with link + => print editgroup + +webrecorder workflow + => capture single page on webrecorder + => download WARC + => upload to petabox item + => generate CDX snippet + => create webcapture entity locally + => submit to API (controlled by flag) and print editgroup + +helpers: +x "submit" and "accept" util functions (for editgroups) +- web view to show submitted/recent/accepted editgroups by editor +- create entity from JSON + +other ideas: +- general "add a URL" (for files, filesets, webcaptures) helper command + +## Commands + + cat gwb_20050408060956.replay.html | hxwls -l \ + | rg -v '^a\t' \ + | rg -v '\t//archive.org/' \ + | rg '\t/web/' \ + | cut -f3 \ + | sort -u + -- cgit v1.2.3