From 97b293dbed0b699602d88889224677b6b4e8d7e5 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Tue, 11 Jan 2022 10:46:20 +0100
Subject: notes: refcat update

---
 notes/2022_01_10_refcat_update.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 notes/2022_01_10_refcat_update.md

diff --git a/notes/2022_01_10_refcat_update.md b/notes/2022_01_10_refcat_update.md
new file mode 100644
index 0000000..795a9d4
--- /dev/null
+++ b/notes/2022_01_10_refcat_update.md
@@ -0,0 +1,15 @@
+# Refcat update
+
+* new refs export, about 10% more (2.7B)
+* new fatcat export
+
+New wikipedia extraction:
+
+```
+martin@ia601101:/magna/data/wikipedia_citations_2020-07-14 $ LC_ALL=C grep ID_list minimal_dataset.json | grep -c DOI
+1442189
+
+$ jq -rc '.refs[] | select(.ID_list != null) | {"URL": .URL, "Title": .title, "ID_list": .ID_list}' enwiki-20211201-pages-articles.citations.json | pv -l > minimal.json
+$ grep -c DOI minimal.json
+1932578
+```
-- 
cgit v1.2.3