diff options
Diffstat (limited to 'notes')
| -rw-r--r-- | notes/pipeline_commands.md | 28 | 
1 files changed, 16 insertions, 12 deletions
diff --git a/notes/pipeline_commands.md b/notes/pipeline_commands.md index a749ad4..1f90a45 100644 --- a/notes/pipeline_commands.md +++ b/notes/pipeline_commands.md @@ -2,26 +2,28 @@  Dependencies:      sudo apt install poppler-utils -    pipenv shell      pip install requests python-magic +Context: + +    export CORDDATE="2020-03-27" +  Fetch and transform metadata:      mkdir -p metadata fulltext_web -    wget https://archive.org/download/s2-cord19-dataset/cord19.2020-03-27.csv -    mv cord19.2020-03-27.csv metadata -    ./bin/parse_cord19_csv.py metadata/cord19.2020-03-27.csv > metadata/cord19.2020-03-27.json -    cat metadata/cord19.2020-03-27.json | parallel -j10 --linebuffer --round-robin --pipe ./bin/cord19_fatcat_enrich.py - | pv -l > metadata/cord19.2020-03-27.enrich.json -    cat metadata/cord19.2020-03-27.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.2020-03-27.missing.json +    wget https://archive.org/download/s2-cord19-dataset/cord19.$CORDDATE.csv -O metadata/cord19.$CORDDATE.csv metadata +    ./bin/parse_cord19_csv.py metadata/cord19.$CORDDATE.csv > metadata/cord19.$CORDDATE.json +    cat metadata/cord19.$CORDDATE.json | parallel -j10 --linebuffer --round-robin --pipe ./covid19_tool.py enrich-fatcat - | pv -l > metadata/cord19.$CORDDATE.enrich.json +    cat metadata/cord19.$CORDDATE.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.$CORDDATE.missing.json  Existing fatcat ES transform:      # in fatcat python directory, pipenv shell -    cat /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - - | pv -l > /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.fatcat_es.json +    cat /srv/covid19.fatcat.wiki/src/metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - - | pv -l > /srv/covid19.fatcat.wiki/src/metadata/cord19.$CORDDATE.fatcat_es.json  Download fulltext from wayback: -    cat metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log +    cat metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log  Extract text from PDFs: @@ -35,6 +37,8 @@ Create thumbnails:  Fetch GROBID: +    # TODO +  Convert GROBID XML to JSON:      ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/grobid/{} @@ -42,9 +46,9 @@ Convert GROBID XML to JSON:  Create large derivatives file (including extracted fulltext): -    ./cord19_fatcat_derivatives.py metadata/cord19.2020-03-27.enrich.json --base-dir fulltext_web/ | pv -l > metadata/cord19.2020-03-27.fulltext.json +    ./covid19_tool.py enrich-derivatives metadata/cord19.$CORDDATE.enrich.json --base-dir fulltext_web/ | pv -l > metadata/cord19.$CORDDATE.fulltext.json -    cat metadata/cord19.2020-03-27.fulltext.json | jq .fulltext_status -r | sort | uniq -c | sort -nr +    cat metadata/cord19.$CORDDATE.fulltext.json | jq .fulltext_status -r | sort | uniq -c | sort -nr  ## ES Indices @@ -55,7 +59,7 @@ Create fulltext index, transform to ES schema and index:      http put :9200/covid19_fatcat_fulltext < schema/fulltext_schema.v00.json      # in fatcat_covid19, pipenv shell -    ./elastic_transform.py metadata/cord19.2020-03-27.fulltext.json | pv -l | esbulk -verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release +    ./covid19_tool.py transform-es metadata/cord19.$CORDDATE.fulltext.json | pv -l | esbulk -verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release  Create and index existing `fatcat_release` schema: @@ -63,7 +67,7 @@ Create and index existing `fatcat_release` schema:      # in fatcat python directory, pipenv shell      export LC_ALL=C.UTF-8 -    cat /srv/fatcat_covid19/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | pv -l | ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index covid19_fatcat_release -type release +    cat /srv/fatcat_covid19/src/metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | rg -v '^null$' | pv -l | ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index covid19_fatcat_release -type release  ## GROBID Processing  | 
