From 3599ebce9941fabe06640d1f664cb4949eddd74d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Apr 2020 18:12:41 -0700 Subject: update pipeline commands --- notes/pipeline_commands.md | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'notes') diff --git a/notes/pipeline_commands.md b/notes/pipeline_commands.md index a749ad4..1f90a45 100644 --- a/notes/pipeline_commands.md +++ b/notes/pipeline_commands.md @@ -2,26 +2,28 @@ Dependencies: sudo apt install poppler-utils - pipenv shell pip install requests python-magic +Context: + + export CORDDATE="2020-03-27" + Fetch and transform metadata: mkdir -p metadata fulltext_web - wget https://archive.org/download/s2-cord19-dataset/cord19.2020-03-27.csv - mv cord19.2020-03-27.csv metadata - ./bin/parse_cord19_csv.py metadata/cord19.2020-03-27.csv > metadata/cord19.2020-03-27.json - cat metadata/cord19.2020-03-27.json | parallel -j10 --linebuffer --round-robin --pipe ./bin/cord19_fatcat_enrich.py - | pv -l > metadata/cord19.2020-03-27.enrich.json - cat metadata/cord19.2020-03-27.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.2020-03-27.missing.json + wget https://archive.org/download/s2-cord19-dataset/cord19.$CORDDATE.csv -O metadata/cord19.$CORDDATE.csv + ./bin/parse_cord19_csv.py metadata/cord19.$CORDDATE.csv > metadata/cord19.$CORDDATE.json + cat metadata/cord19.$CORDDATE.json | parallel -j10 --linebuffer --round-robin --pipe ./covid19_tool.py enrich-fatcat - | pv -l > metadata/cord19.$CORDDATE.enrich.json + cat metadata/cord19.$CORDDATE.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.$CORDDATE.missing.json Existing fatcat ES transform: # in fatcat python directory, pipenv shell - cat /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - - | pv -l > 
/srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.fatcat_es.json + cat /srv/covid19.fatcat.wiki/src/metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - - | pv -l > /srv/covid19.fatcat.wiki/src/metadata/cord19.$CORDDATE.fatcat_es.json Download fulltext from wayback: - cat metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log + cat metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log Extract text from PDFs: @@ -35,6 +37,8 @@ Create thumbnails: Fetch GROBID: + # TODO + Convert GROBID XML to JSON: ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/grobid/{} @@ -42,9 +46,9 @@ Convert GROBID XML to JSON: Create large derivatives file (including extracted fulltext): - ./cord19_fatcat_derivatives.py metadata/cord19.2020-03-27.enrich.json --base-dir fulltext_web/ | pv -l > metadata/cord19.2020-03-27.fulltext.json + ./covid19_tool.py enrich-derivatives metadata/cord19.$CORDDATE.enrich.json --base-dir fulltext_web/ | pv -l > metadata/cord19.$CORDDATE.fulltext.json - cat metadata/cord19.2020-03-27.fulltext.json | jq .fulltext_status -r | sort | uniq -c | sort -nr + cat metadata/cord19.$CORDDATE.fulltext.json | jq .fulltext_status -r | sort | uniq -c | sort -nr ## ES Indices @@ -55,7 +59,7 @@ Create fulltext index, transform to ES schema and index: http put :9200/covid19_fatcat_fulltext < schema/fulltext_schema.v00.json # in fatcat_covid19, pipenv shell - ./elastic_transform.py metadata/cord19.2020-03-27.fulltext.json | pv -l | esbulk -verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release + ./covid19_tool.py transform-es metadata/cord19.$CORDDATE.fulltext.json | pv -l | esbulk 
-verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release Create and index existing `fatcat_release` schema: @@ -63,7 +67,7 @@ Create and index existing `fatcat_release` schema: # in fatcat python directory, pipenv shell export LC_ALL=C.UTF-8 - cat /srv/fatcat_covid19/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | pv -l | ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index covid19_fatcat_release -type release + cat /srv/fatcat_covid19/src/metadata/cord19.$CORDDATE.enrich.json | jq .fatcat_release -c | rg -v '^null$' | pv -l | ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index covid19_fatcat_release -type release ## GROBID Processing -- cgit v1.2.3