diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 19:42:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 19:42:38 -0700 |
commit | 6f556a2f4660c6d5d944be5346d48a451c73e61a (patch) | |
tree | 72678b2a7588ce7e1d111a9b7c761acf4e70c280 | |
parent | f5f1b391442b1a8c94821f61129da7ffdd18ef6d (diff) | |
download | fatcat-covid19-6f556a2f4660c6d5d944be5346d48a451c73e61a.tar.gz fatcat-covid19-6f556a2f4660c6d5d944be5346d48a451c73e61a.zip |
pipeline note tweaks for 2020-04-03
-rw-r--r-- | notes/pipeline_commands.md | 9 |
1 file changed, 3 insertions, 6 deletions
```diff
diff --git a/notes/pipeline_commands.md b/notes/pipeline_commands.md
index 1f90a45..a9eddcd 100644
--- a/notes/pipeline_commands.md
+++ b/notes/pipeline_commands.md
@@ -16,14 +16,11 @@ Fetch and transform metadata:
     cat metadata/cord19.$CORDDATE.json | parallel -j10 --linebuffer --round-robin --pipe ./covid19_tool.py enrich-fatcat - | pv -l > metadata/cord19.$CORDDATE.enrich.json
 
     cat metadata/cord19.$CORDDATE.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.$CORDDATE.missing.json
 
-Existing fatcat ES transform:
-
-    # in fatcat python directory, pipenv shell
-    cat /srv/covid19.fatcat.wiki/src/metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - | pv -l > /srv/covid19.fatcat.wiki/src/metadata/cord19.$CORDDATE.fatcat_es.json
-
 Download fulltext from wayback:
 
-    cat metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log
+    cat metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_$CORDDATE.log
+
+    cut -f1 fatcat_web_$CORDDATE.log | sort | uniq -c | sort -nr
 
 Extract text from PDFs:
```