author    Bryan Newbold <bnewbold@archive.org>    2020-04-03 15:15:32 -0700
committer Bryan Newbold <bnewbold@archive.org>    2020-04-03 15:15:32 -0700
commit    4cbbdf33ee2a9651f79f96e4bf290d8bc721f69d (patch)
tree      a81bf8d2d89f6a19a3e20e4b743f3dfc4c4c8ad0 /notes
parent    2bdda2dbf8204d0dd36a4b5b7460ff89bfcc3b5c (diff)
move random files to notes/
Diffstat (limited to 'notes')
-rw-r--r--    notes/pipeline_commands.md    74
-rw-r--r--    notes/plan.mv                 53
-rw-r--r--    notes/rfc.md                  38
3 files changed, 165 insertions, 0 deletions
diff --git a/notes/pipeline_commands.md b/notes/pipeline_commands.md
new file mode 100644
index 0000000..a749ad4
--- /dev/null
+++ b/notes/pipeline_commands.md
@@ -0,0 +1,74 @@

Dependencies:

    sudo apt install poppler-utils
    pipenv shell
    pip install requests python-magic

Fetch and transform metadata:

    mkdir -p metadata fulltext_web
    wget https://archive.org/download/s2-cord19-dataset/cord19.2020-03-27.csv
    mv cord19.2020-03-27.csv metadata
    ./bin/parse_cord19_csv.py metadata/cord19.2020-03-27.csv > metadata/cord19.2020-03-27.json
    cat metadata/cord19.2020-03-27.json | parallel -j10 --linebuffer --round-robin --pipe ./bin/cord19_fatcat_enrich.py - | pv -l > metadata/cord19.2020-03-27.enrich.json
    cat metadata/cord19.2020-03-27.enrich.json | jq 'select(.release_id == null) | .cord19_paper' -c > metadata/cord19.2020-03-27.missing.json

Existing fatcat ES transform:

    # in fatcat python directory, pipenv shell
    cat /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | ./fatcat_transform.py elasticsearch-releases - - | pv -l > /srv/covid19.fatcat.wiki/src/metadata/cord19.2020-03-27.fatcat_es.json

Download fulltext from wayback:

    cat metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | parallel -j20 --linebuffer --round-robin --pipe ./bin/deliver_file2disk.py --disk-dir fulltext_web - | pv -l > fatcat_web_20200327.log

Extract text from PDFs:

    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/pdftotext/{}
    fd -I .pdf fulltext_web/pdf/ | cut -c18-60 | parallel -j10 pdftotext fulltext_web/pdf/{}.pdf fulltext_web/pdftotext/{}.txt

Create thumbnails:

    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/thumbnail/{}
    fd -I .pdf fulltext_web/pdf/ | cut -c18-60 | parallel -j10 pdftocairo -png -singlefile -scale-to-x 400 -scale-to-y -1 fulltext_web/pdf/{}.pdf fulltext_web/thumbnail/{}

Fetch GROBID:

Convert GROBID XML to JSON (see the TEI sketch at the end of these notes):

    ls fulltext_web/pdf/ | parallel mkdir -p fulltext_web/grobid/{}
    fd -I .xml fulltext_web/grobid/ | cut -c18-60 | parallel -j10 "bin/grobid2json.py fulltext_web/grobid/{}.xml > fulltext_web/grobid/{}.json"

Create large derivatives file (including extracted fulltext):

    ./cord19_fatcat_derivatives.py metadata/cord19.2020-03-27.enrich.json --base-dir fulltext_web/ | pv -l > metadata/cord19.2020-03-27.fulltext.json

    cat metadata/cord19.2020-03-27.fulltext.json | jq .fulltext_status -r | sort | uniq -c | sort -nr

## ES Indices

Create fulltext index, transform to ES schema and index:

    # if existing, first: http delete :9200/covid19_fatcat_fulltext
    http put :9200/covid19_fatcat_fulltext < schema/fulltext_schema.v00.json

    # in fatcat_covid19, pipenv shell
    ./elastic_transform.py metadata/cord19.2020-03-27.fulltext.json | pv -l | esbulk -verbose -size 1000 -id fatcat_ident -w 8 -index covid19_fatcat_fulltext -type release

Create and index existing `fatcat_release` schema:

    http put :9200/covid19_fatcat_release < schema/release_schema_v03b.json

    # in fatcat python directory, pipenv shell
    export LC_ALL=C.UTF-8
    cat /srv/fatcat_covid19/src/metadata/cord19.2020-03-27.enrich.json | jq .fatcat_release -c | rg -v '^null$' | pv -l | ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index covid19_fatcat_release -type release

## GROBID Processing

    zip -r fulltext_web.zip fulltext_web

    # on GROBID worker, in sandcrawler repo and pipenv
    ./grobid_tool.py --grobid-host http://localhost:8070 -j 24 extract-zipfile /srv/sandcrawler/tasks/fulltext_web.zip | pv -l > /srv/sandcrawler/tasks/fulltext_web.grobid.json
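For reference, the `./bin/parse_cord19_csv.py` step in "Fetch and transform metadata" above turns the CORD-19 CSV into line-delimited JSON so downstream tools (`jq`, `parallel`, `pv`) can stream it. A minimal sketch of that transform; the actual script in `bin/` may rename or clean columns differently:

    #!/usr/bin/env python3
    """Minimal sketch: convert a CORD-19 CSV to JSON lines on stdout.

    Passing rows through unmodified is an assumption; the real
    bin/parse_cord19_csv.py may normalize fields.
    """

    import csv
    import json
    import sys

    def main():
        with open(sys.argv[1], newline="") as f:
            # one compact JSON object per CSV row, printed per line
            for row in csv.DictReader(f):
                print(json.dumps(row, sort_keys=True))

    if __name__ == "__main__":
        main()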
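The `bin/grobid2json.py` step above converts GROBID's TEI-XML output to JSON. A rough sketch of the idea; the TEI element paths here are simplified assumptions, and the real script extracts far more (authors, abstract, citations, etc):

    #!/usr/bin/env python3
    """Rough sketch: pull title and body text out of a GROBID TEI-XML file."""

    import json
    import sys
    import xml.etree.ElementTree as ET

    NS = {"tei": "http://www.tei-c.org/ns/1.0"}

    def tei_to_dict(path):
        tree = ET.parse(path)
        title = tree.find(".//tei:titleStmt/tei:title", NS)
        body = tree.find(".//tei:body", NS)
        return {
            "title": title.text if title is not None else None,
            # flatten all body text into a single string
            "body": " ".join(body.itertext()) if body is not None else None,
        }

    if __name__ == "__main__":
        print(json.dumps(tei_to_dict(sys.argv[1])))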
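The indexing steps above use the `esbulk` CLI. For illustration only, roughly the same operation with the official Python client (client versions differ in connection setup, and the `-type release` flag reflects an older ES; the index name and id field below match the `esbulk` flags above):

    #!/usr/bin/env python3
    """Illustration: bulk-index JSON lines from stdin, like the esbulk step."""

    import json
    import sys

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    def actions(lines):
        for line in lines:
            doc = json.loads(line)
            yield {
                "_index": "covid19_fatcat_fulltext",
                "_id": doc["fatcat_ident"],
                "_source": doc,
            }

    if __name__ == "__main__":
        es = Elasticsearch("http://localhost:9200")
        ok, errors = bulk(es, actions(sys.stdin))
        print(f"indexed: {ok}", file=sys.stderr)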
diff --git a/notes/plan.mv b/notes/plan.mv
new file mode 100644
index 0000000..0f97305
--- /dev/null
+++ b/notes/plan.mv
@@ -0,0 +1,53 @@

layout:
- pipenv, python3.7, flask, elasticsearch-dsl, semantic-ui
- python code/libs in sub-directory
- single-file flask with all routes, call helper routines

prototype pipeline:
- CORD-19 dataset
- enrich script fetches fatcat metadata, outputs combined .json
- download + derive manually
- transform script (based on download) creates ES documents as JSON

pipeline:
- .json files with basic metadata from each source
  => CORD-19
  => fatcat ES queries
  => manual addition
- enrich script takes all the above, does fatcat lookups, de-dupes by release ident, dumps json with tags and extra metadata (see the de-dupe sketch at the end of these notes)

design:
- elasticsearch schema
- i18n URL schema
- single-page? multi-page?
- tags/indicators for quality

infra:
- register dns: covid19.qa.fatcat.wiki, covid19.fatcat.wiki

examples:
- jupyter notebook
- observable hq

implement:
- download GROBID as well as PDFs

topics:
- Favipiravir
- Chloroquine

tasks/research:
- tracking down every single paper from WHO etc
- finding interesting older papers

papers:
- imperial college paper
- WHO reports and recommendations
- "hammer and the dance" blog-post
- korean, chinese, singaporean reports
- http://subject.med.wanfangdata.com.cn/Channel/7?mark=34


tools?
- vega-lite

diff --git a/notes/rfc.md b/notes/rfc.md
new file mode 100644
index 0000000..6a5c516
--- /dev/null
+++ b/notes/rfc.md
@@ -0,0 +1,38 @@

Research index and searchable discovery tool for papers and datasets related to
COVID-19.

Features:
- fulltext search over papers
- direct PDF downloads
- find content by search queries + lists of identifiers

## Design

Web interface built on elasticsearch. Guessing on the order of 100k entities.

Batch back-end system aggregates papers of interest, fetches metadata from
fatcat, fetches fulltext+GROBID, and indexes into elasticsearch. Run
periodically (eg, daily or hourly).

Some light quality tooling to find bad metadata; do cleanups in fatcat itself.


## Thoughts / Brainstorm

Tagging? Eg, by type of flu, or why a paper was included.

Clearly indicate publication status (eg, pre-prints).

Auto-translation to multiple languages. Translation/i18n of user interface.

Dashboards/graphs of stats?

Faceted search.


## Also

Find historical papers of interest (eg, the Spanish Flu); feature in blog posts.

Manually add interesting/valuable greylit, like notable blog posts and WHO reports.
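The plan notes above describe an enrich script that "de-dupes by release ident". A minimal sketch of that merge over line-delimited JSON, assuming each record carries a `release_ident` field (a hypothetical name) once the fatcat lookup has run:

    #!/usr/bin/env python3
    """Sketch: de-dupe JSON lines on stdin by release ident."""

    import json
    import sys

    def dedupe(lines):
        """Keep the first record seen per release ident; records without
        an ident (no fatcat match) are all kept."""
        seen = set()
        for line in lines:
            record = json.loads(line)
            ident = record.get("release_ident")  # hypothetical field name
            if ident:
                if ident in seen:
                    continue
                seen.add(ident)
            yield record

    if __name__ == "__main__":
        for record in dedupe(sys.stdin):
            sys.stdout.write(json.dumps(record) + "\n")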
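The RFC's core feature is fulltext search, and the plan names elasticsearch-dsl for the web layer. A sketch of what such a query could look like; the field names (`title`, `abstract`, `fulltext.body`, `language`) are assumptions about the fulltext schema, not the actual mapping:

    from elasticsearch_dsl import Search, connections

    connections.create_connection(hosts=["localhost:9200"])

    def fulltext_search(query, lang=None):
        """Run a boosted fulltext query against the assumed fulltext index."""
        s = Search(index="covid19_fatcat_fulltext")
        s = s.query("query_string", query=query,
                    fields=["title^5", "abstract^2", "fulltext.body"])
        if lang:
            s = s.filter("term", language=lang)
        return s[:25].execute()

    for hit in fulltext_search("chloroquine"):
        print(hit.meta.id, getattr(hit, "title", None))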