From 6f556a2f4660c6d5d944be5346d48a451c73e61a Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 3 Apr 2020 19:42:38 -0700
Subject: pipeline note tweaks for 2020-04-03

---
 notes/pipeline_commands.md | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/notes/pipeline_commands.md b/notes/pipeline_commands.md
index 1f90a45..a9eddcd 100644
--- a/notes/pipeline_commands.md
+++ b/notes/pipeline_commands.md
@@ -16,14 +16,11 @@ Fetch and transform metadata:
     cat metadata/cord19.$CORDDATE.json | parallel -j10 --linebuffer --round-robin --pipe ./covid19_tool.py enrich-fatcat - | pv -l > metadata/cord19.$CORDDATE.enrich.json
     cat metadata/cord19.$CORDDATE.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.$CORDDATE.missing.json
 
-Existing fatcat ES transform:
-
-    # in fatcat python directory, pipenv shell
-    cat /srv/covid19.fatcat.wiki/src/metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - | pv -l > /srv/covid19.fatcat.wiki/src/metadata/cord19.$CORDDATE.fatcat_es.json
-
 Download fulltext from wayback:
 
-    cat metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log
+    cat metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_$CORDDATE.log
+
+    cut -f1 fatcat_web_$CORDDATE.log | sort | uniq -c | sort -nr
 
 Extract text from PDFs:
 
-- 
cgit v1.2.3