From 4cbbdf33ee2a9651f79f96e4bf290d8bc721f69d Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 3 Apr 2020 15:15:32 -0700
Subject: move random files to notes/

---
 commands.md                | 74 ----------------------------------------------
 notes/pipeline_commands.md | 74 ++++++++++++++++++++++++++++++++++++++++++++++
 notes/plan.mv              | 53 +++++++++++++++++++++++++++++++++
 notes/rfc.md               | 38 ++++++++++++++++++++++++
 plan.txt                   | 53 ---------------------------------
 rfc.md                     | 38 ------------------------
 6 files changed, 165 insertions(+), 165 deletions(-)
 delete mode 100644 commands.md
 create mode 100644 notes/pipeline_commands.md
 create mode 100644 notes/plan.mv
 create mode 100644 notes/rfc.md
 delete mode 100644 plan.txt
 delete mode 100644 rfc.md

diff --git a/commands.md b/commands.md
deleted file mode 100644
index a749ad4..0000000
--- a/commands.md
+++ /dev/null
@@ -1,74 +0,0 @@
-
-Dependencies:
-
-    sudo apt install poppler-utils
-    pipenv shell
-    pip install requests python-magic
-
-Fetch and transform metadata:
-
-    mkdir -p metadata fulltext_web
-    wget https://archive.org/download/s2-cord19-dataset/cord19.2020-03-27.csv
-    mv cord19.2020-03-27.csv metadata
-    ./bin/parse_cord19_csv.py metadata/cord19.2020-03-27.csv > metadata/cord19.2020-03-27.json
-    cat metadata/cord19.2020-03-27.json | parallel -j10 --linebuffer --round-robin --pipe ./bin/cord19_fatcat_enrich.py - | pv -l > metadata/cord19.2020-03-27.enrich.json
-    cat metadata/cord19.2020-03-27.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.2020-03-27.missing.json
-
-Existing fatcat ES transform:
-
-    # in fatcat python directory, pipenv shell
-    cat /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - - | pv -l > /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.fatcat_es.json
-
-Download fulltext from wayback:
-
-    cat metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log
-
-Extract text from PDFs:
-
-    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/pdftotext/{}
-    fd -I .pdf fulltext_web/pdf/ | cut -c18-60 | parallel -j10 pdftotext fulltext_web/pdf/{}.pdf fulltext_web/pdftotext/{}.txt
-
-Create thumbnails:
-
-    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/thumbnail/{}
-    fd -I .pdf fulltext_web/pdf/ | cut -c18-60 | parallel -j10 pdftocairo -png -singlefile -scale-to-x 400 -scale-to-y -1 fulltext_web/pdf/{}.pdf fulltext_web/thumbnail/{}
-
-Fetch GROBID:
-
-Convert GROBID XML to JSON:
-
-    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/grobid/{}
-    fd -I .xml fulltext_web/grobid/ | cut -c18-60 | parallel -j10 "bin/grobid2json.py fulltext_web/grobid/{}.xml > fulltext_web/grobid/{}.json"
-
-Create large derivatives file (including extracted fulltext):
-
-    ./cord19_fatcat_derivatives.py metadata/cord19.2020-03-27.enrich.json --base-dir fulltext_web/ | pv -l > metadata/cord19.2020-03-27.fulltext.json
-
-    cat metadata/cord19.2020-03-27.fulltext.json | jq .fulltext_status -r | sort | uniq -c | sort -nr
-
-
-## ES Indices
-
-Create fulltext index, transform to ES schema and index:
-
-    # if existing, first: http delete :9200/covid19_fatcat_fulltext
-    http put :9200/covid19_fatcat_fulltext < schema/fulltext_schema.v00.json
-
-    # in fatcat_covid19, pipenv shell
-    ./elastic_transform.py metadata/cord19.2020-03-27.fulltext.json | pv -l | esbulk -verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release
-
-Create and index existing `fatcat_release` schema:
-
-    http put :9200/covid19_fatcat_release < schema/release_schema_v03b.json
-
-    # in fatcat python directory, pipenv shell
-    export LC_ALL=C.UTF-8
-    cat /srv/fatcat_covid19/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | pv -l | ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index covid19_fatcat_release -type release
-
-## GROBID Processing
-
-    zip -r fulltext_web.zip fulltext_web
-
-    # on GROBID worker, in sandcrawler repo and pipenv
-    ./grobid_tool.py --grobid-host http://localhost:8070 -j 24 extract-zipfile /srv/sandcrawler/tasks/fulltext_web.zip | pv -l > /srv/sandcrawler/tasks/fulltext_web.grobid.json
-
diff --git a/notes/pipeline_commands.md b/notes/pipeline_commands.md
new file mode 100644
index 0000000..a749ad4
--- /dev/null
+++ b/notes/pipeline_commands.md
@@ -0,0 +1,74 @@
+
+Dependencies:
+
+    sudo apt install poppler-utils
+    pipenv shell
+    pip install requests python-magic
+
+Fetch and transform metadata:
+
+    mkdir -p metadata fulltext_web
+    wget https://archive.org/download/s2-cord19-dataset/cord19.2020-03-27.csv
+    mv cord19.2020-03-27.csv metadata
+    ./bin/parse_cord19_csv.py metadata/cord19.2020-03-27.csv > metadata/cord19.2020-03-27.json
+    cat metadata/cord19.2020-03-27.json | parallel -j10 --linebuffer --round-robin --pipe ./bin/cord19_fatcat_enrich.py - | pv -l > metadata/cord19.2020-03-27.enrich.json
+    cat metadata/cord19.2020-03-27.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.2020-03-27.missing.json
+
+Existing fatcat ES transform:
+
+    # in fatcat python directory, pipenv shell
+    cat /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - - | pv -l > /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.fatcat_es.json
+
+Download fulltext from wayback:
+
+    cat metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log
+
+Extract text from PDFs:
+
+    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/pdftotext/{}
+    fd -I .pdf fulltext_web/pdf/ | cut -c18-60 | parallel -j10 pdftotext fulltext_web/pdf/{}.pdf fulltext_web/pdftotext/{}.txt
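+
+A note on the `cut -c18-60` spans used above and below: `fulltext_web/pdf/` is
+17 characters long, so characters 18-60 select the 43-character sharded path
+(`<2-hex-shard>/<40-hex-sha1>`, assuming SHA-1-based file naming). A rough
+equivalent that does not depend on byte offsets, sketched with shell parameter
+expansion rather than tested against this exact layout:
+
+    fd -I .pdf fulltext_web/pdf/ | while read -r path; do
+        rel=${path#fulltext_web/pdf/}   # strip the directory prefix
+        rel=${rel%.pdf}                 # strip the extension
+        pdftotext "fulltext_web/pdf/${rel}.pdf" "fulltext_web/pdftotext/${rel}.txt"
+    done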
+
+Create thumbnails:
+
+    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/thumbnail/{}
+    fd -I .pdf fulltext_web/pdf/ | cut -c18-60 | parallel -j10 pdftocairo -png -singlefile -scale-to-x 400 -scale-to-y -1 fulltext_web/pdf/{}.pdf fulltext_web/thumbnail/{}
+
+Fetch GROBID:
+
+Convert GROBID XML to JSON:
+
+    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/grobid/{}
+    fd -I .xml fulltext_web/grobid/ | cut -c18-60 | parallel -j10 "bin/grobid2json.py fulltext_web/grobid/{}.xml > fulltext_web/grobid/{}.json"
+
+Create large derivatives file (including extracted fulltext):
+
+    ./cord19_fatcat_derivatives.py metadata/cord19.2020-03-27.enrich.json --base-dir fulltext_web/ | pv -l > metadata/cord19.2020-03-27.fulltext.json
+
+    cat metadata/cord19.2020-03-27.fulltext.json | jq .fulltext_status -r | sort | uniq -c | sort -nr
+
+
+## ES Indices
+
+Create fulltext index, transform to ES schema and index:
+
+    # if existing, first: http delete :9200/covid19_fatcat_fulltext
+    http put :9200/covid19_fatcat_fulltext < schema/fulltext_schema.v00.json
+
+    # in fatcat_covid19, pipenv shell
+    ./elastic_transform.py metadata/cord19.2020-03-27.fulltext.json | pv -l | esbulk -verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release
+
+Create and index existing `fatcat_release` schema:
+
+    http put :9200/covid19_fatcat_release < schema/release_schema_v03b.json
+
+    # in fatcat python directory, pipenv shell
+    export LC_ALL=C.UTF-8
+    cat /srv/fatcat_covid19/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | pv -l | ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index covid19_fatcat_release -type release
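+
+To spot-check either index after loading, a couple of httpie queries (a
+sketch; the search term is illustrative, and exact counts depend on the
+dataset snapshot):
+
+    http get :9200/covid19_fatcat_fulltext/_count
+    http get :9200/covid19_fatcat_fulltext/_search q=='coronavirus' size==1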
+
+## GROBID Processing
+
+    zip -r fulltext_web.zip fulltext_web
+
+    # on GROBID worker, in sandcrawler repo and pipenv
+    ./grobid_tool.py --grobid-host http://localhost:8070 -j 24 extract-zipfile /srv/sandcrawler/tasks/fulltext_web.zip | pv -l > /srv/sandcrawler/tasks/fulltext_web.grobid.json
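+
+After the zipfile run completes, a quick tally of extraction outcomes (a
+sketch; this assumes each output line is a JSON object with a `status` field,
+which may not match the actual sandcrawler output schema):
+
+    cat /srv/sandcrawler/tasks/fulltext_web.grobid.json | jq .status -r | sort | uniq -c | sort -nr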
diff --git a/notes/plan.mv b/notes/plan.mv
new file mode 100644
index 0000000..0f97305
--- /dev/null
+++ b/notes/plan.mv
@@ -0,0 +1,53 @@
+
+layout:
+- pipenv, python3.7, flask, elasticsearch-dsl, semantic-ui
+- python code/libs in sub-directory
+- single-file flask app with all routes, calling helper routines
+
+prototype pipeline:
+- CORD-19 dataset
+- enrich script fetches fatcat metadata, outputs combined .json
+- download + derive manually
+- transform script (based on download) creates ES documents as JSON
+
+pipeline:
+- .json files with basic metadata from each source
+    => CORD-19
+    => fatcat ES queries
+    => manual addition
+- enrich script takes all the above, does fatcat lookups, de-dupes by release ident, dumps json with tags and extra metadata
+
+design:
+- elasticsearch schema
+- i18n URL schema
+- single-page? multi-page?
+- tags/indicators for quality
+
+infra:
+- register dns: covid19.qa.fatcat.wiki, covid19.fatcat.wiki
+
+examples:
+- jupyter notebook
+- observable hq
+
+implement:
+- download GROBID as well as PDFs
+
+topics:
+- Favipiravir
+- Chloroquine
+
+tasks/research:
+- tracking down every single paper from WHO etc
+- finding interesting older papers
+
+papers:
+- imperial college paper
+- WHO reports and recommendations
+- "hammer and the dance" blog-post
+- korean, chinese, singaporean reports
+- http://subject.med.wanfangdata.com.cn/Channel/7?mark=34
+
+
+tools?
+- vega-lite
diff --git a/notes/rfc.md b/notes/rfc.md
new file mode 100644
index 0000000..6a5c516
--- /dev/null
+++ b/notes/rfc.md
@@ -0,0 +1,38 @@
+
+Research index and searchable discovery tool for papers and datasets related
+to COVID-19.
+
+Features:
+- fulltext search over papers
+- direct PDF downloads
+- find content by search queries + lists of identifiers
+
+## Design
+
+Web interface built on elasticsearch. Guessing on the order of 100k entities.
+
+Batch back-end system aggregates papers of interest, fetches metadata from
+fatcat, fetches fulltext+GROBID, and indexes into elasticsearch. Run
+periodically (eg, daily or hourly).
+
+Some light quality tooling to find bad metadata; do cleanups in fatcat itself.
+
+
+## Thoughts / Brainstorm
+
+Tagging? Eg, by type of flu, or why a paper was included.
+
+Clearly indicate publication status (pre-prints).
+
+Auto-translation to multiple languages. Translation/i18n of user interface.
+
+Dashboards/graphs of stats?
+
+Faceted search.
+
+
+## Also
+
+Find historical papers of interest, eg the Spanish Flu; feature in blog posts.
+
+Manually add interesting/valuable greylit like notable blog posts and WHO
+reports.
diff --git a/plan.txt b/plan.txt
deleted file mode 100644
index 0f97305..0000000
--- a/plan.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-
-layout:
-- pipenv, python3.7, flask, elasticsearch-dsl, semantic-ui
-- python code/libs in sub-directory
-- single-file flask app with all routes, calling helper routines
-
-prototype pipeline:
-- CORD-19 dataset
-- enrich script fetches fatcat metadata, outputs combined .json
-- download + derive manually
-- transform script (based on download) creates ES documents as JSON
-
-pipeline:
-- .json files with basic metadata from each source
-    => CORD-19
-    => fatcat ES queries
-    => manual addition
-- enrich script takes all the above, does fatcat lookups, de-dupes by release ident, dumps json with tags and extra metadata
-
-design:
-- elasticsearch schema
-- i18n URL schema
-- single-page? multi-page?
-- tags/indicators for quality
-
-infra:
-- register dns: covid19.qa.fatcat.wiki, covid19.fatcat.wiki
-
-examples:
-- jupyter notebook
-- observable hq
-
-implement:
-- download GROBID as well as PDFs
-
-topics:
-- Favipiravir
-- Chloroquine
-
-tasks/research:
-- tracking down every single paper from WHO etc
-- finding interesting older papers
-
-papers:
-- imperial college paper
-- WHO reports and recommendations
-- "hammer and the dance" blog-post
-- korean, chinese, singaporean reports
-- http://subject.med.wanfangdata.com.cn/Channel/7?mark=34
-
-
-tools?
-- vega-lite
diff --git a/rfc.md b/rfc.md
deleted file mode 100644
index 6a5c516..0000000
--- a/rfc.md
+++ /dev/null
@@ -1,38 +0,0 @@
-
-Research index and searchable discovery tool for papers and datasets related
-to COVID-19.
-
-Features:
-- fulltext search over papers
-- direct PDF downloads
-- find content by search queries + lists of identifiers
-
-## Design
-
-Web interface built on elasticsearch. Guessing on the order of 100k entities.
-
-Batch back-end system aggregates papers of interest, fetches metadata from
-fatcat, fetches fulltext+GROBID, and indexes into elasticsearch. Run
-periodically (eg, daily or hourly).
-
-Some light quality tooling to find bad metadata; do cleanups in fatcat itself.
-
-
-## Thoughts / Brainstorm
-
-Tagging? Eg, by type of flu, or why a paper was included.
-
-Clearly indicate publication status (pre-prints).
-
-Auto-translation to multiple languages. Translation/i18n of user interface.
-
-Dashboards/graphs of stats?
-
-Faceted search.
-
-
-## Also
-
-Find historical papers of interest, eg the Spanish Flu; feature in blog posts.
-
-Manually add interesting/valuable greylit like notable blog posts and WHO
-reports.