From 7154c788914e9eec3dcbc7da69d634eec5221da0 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 1 Oct 2020 18:19:51 -0700
Subject: notes on exporting refs corpus from intermediate files

---
 notes/202009_refs_dump.txt | 156 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 notes/202009_refs_dump.txt

diff --git a/notes/202009_refs_dump.txt b/notes/202009_refs_dump.txt
new file mode 100644
index 0000000..49164b6
--- /dev/null
+++ b/notes/202009_refs_dump.txt
@@ -0,0 +1,156 @@

where should code for this live? fatcat-scholar, sandcrawler, fatcat?
preferably the fatcat repo, I guess.

today (2020-09-04), 29.7 million releases have some refs in fatcat, and an
additional 20 million have fulltext in fatcat (50 million total). there are
969,541,971 total references, so expecting something on the order of 1.6
billion references output.

only about 3.75 million references come from wikipedia (en) with a persistent
identifier.

first version of the tool cruises along single-threaded at ~330 docs/sec, or
about 1 million docs/hour:

    zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_00.json.gz | head -n1000000 | pv -l | python -m fatcat_scholar.transform run_refs > /bigger/scholar/fatcat_scholar_work_fulltext.split_00.1m.refs.json
    1M 0:53:13 [ 313 /s]

    wc -l /bigger/scholar/fatcat_scholar_work_fulltext.split_00.1m.refs.json
    => 9,242,758

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .release_ident -r | sort -u | wc -l
    => 282,568

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .ref_source -r | sort | uniq -c | sort -nr
    4760872 crossref
    2847014 grobid
     735459 pubmed
     683909 datacite
     215504 fatcat (probably GROBID, jstage, etc)

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .biblio.url -r | rg -v '^null$' | rg wikipedia.org > wikipedia_urls.tsv
    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .biblio.url -r | rg -v '^null$' | rg wikipedia.org | wc -l
    => 523

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .biblio.url -r | rg -v '^null$' | rg archive.org > archive_urls.tsv
    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .biblio.url -r | rg -v '^null$' | rg archive.org | wc -l
    => 122

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .biblio.pmid -r | rg -v '^null$' | wc -l
    => 500036

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .biblio.doi -r | rg -v '^null$' | wc -l
    => 3636175

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .biblio.url -r | rg -v '^null$' | rg -v doi.org/ | wc -l

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json | jq .biblio.url -r | rg -v '^null$' | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n50

    35233 doi.org
    26263 dx.doi.org
     1539 www.ncbi.nlm.nih.gov
     1518
      843 arxiv.org
      821 www.who.int
      670 www.scielo.br
      642 goo.gl
      463 ec.europa.eu
      449 bit.ly
      434 www.cdc.gov
      431 www.jstor.org
      425 www.sciencedirect.com
      372 www.planalto.gov.br
      356 en.wikipedia.org
      331 www.w3.org
      308 creativecommons.org
      306 www.youtube.com
      295 www.nytimes.com
      278 ssrn.com
      [...]
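Note that the per-host counts above keep doi.org and dx.doi.org (and www-prefixed
variants) as separate rows, and the "1518" row with no host is presumably URLs
with no scheme, so `cut -f3 -d/` hits an empty field. A minimal sketch of host
normalization before counting, related to the "basic URL cleanup" TODO item
below (the sed patterns are illustrative assumptions, not what was actually run):

    cat fatcat_scholar_work_fulltext.split_00.1m.refs.json \
        | jq .biblio.url -r | rg -v '^null$' \
        | cut -f3 -d/ \
        | sed -e 's/^www\.//' -e 's/^dx\.doi\.org$/doi.org/' \
        | sort | uniq -c | sort -nr | head -n50

With that, doi.org and dx.doi.org would collapse into a single row of ~61k.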
TODO:
x year/date of the *citing* document
x 'unstructured' in biblio
x contrib_raw names Optional (to save on key storage space)
x basic URL cleanup
x GROBID *and* fatcat for all lines (?)
x more GROBID refs fields (grobid2json)
    biblStruct
        analytic
            x arXiv:nucl-th/0007068
            x 10.5354/0719-3769.1979.16458
            x PMC3112331
            x 16330524
            ISBN 0-674- 21298-3
            x
        imprint
            x Cambridge University Press
        x
    x 18346083
        arXiv preprint
    x
- debug pubmed refs
- title seems not to come through from fatcat

resources:
- https://git.archive.org/webgroup/fatcat/-/blob/bnewbold-citation-graph-brainstorm/proposals/202008_bulk_citation_graph.md
- https://docs.citationstyles.org/en/1.0.1/specification.html#appendix-iv-variables
- https://guide.fatcat.wiki/entity_release.html
- https://analytics.wikimedia.org/published/datasets/archive/public-datasets/all/mwrefs/mwcites-20180301/

open questions:
- how many citations? how large is this corpus on-disk?
    1 mil   => 2.6 GB (uncompressed)
    150 mil => 390 GB (uncompressed)
- what fraction...
    have an external identifier (quick match)
    look like junk
    have a URL
- how many references to wikipedia? assuming via URL
- how many references to IA services?
    => archive.org, web.archive.org, archive-it, openlibrary.org, etc
    => top resources

----------

running larger batch:

    zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_01.json.gz | pv -l | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_scholar.transform run_refs > /bigger/scholar/fatcat_scholar_work_fulltext.split_01.refs.json
    => 24.7M 5:33:27 [1.23k/s]

    123G Sep 14 12:56 fatcat_scholar_work_fulltext.split_01.refs.json

    pigz fatcat_scholar_work_fulltext.split_01.refs.json

    du -sh fatcat_scholar_work_fulltext.split_01.refs.json.gz
    24G fatcat_scholar_work_fulltext.split_01.refs.json.gz

    zcat fatcat_scholar_work_fulltext.split_01.refs.json.gz | wc -l
    285,551,233

Now expecting a bit below 2 billion references total (285 million refs per
split, times 7 splits), somewhat above the earlier 1.6 billion estimate;
presumably the overshoot is duplication, with refs for the same work coming
from multiple sources.

This really blows up in size, presumably because things like release and work
idents don't compress well and are duplicated for each reference. The JSON
overhead itself should almost entirely compress away.
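A rough way to test that compression hypothesis (a sketch only; `release_ident`
appears in the jq queries above, but the exact work ident field name is an
assumption here) is to compare compressed sizes of the ident fields against
everything else over a sample:

    # idents only vs. everything-but-idents, gzipped, over a 1M-line sample
    zcat fatcat_scholar_work_fulltext.split_01.refs.json.gz | head -n1000000 \
        | jq -c '{release_ident, work_ident}' | gzip | wc -c
    zcat fatcat_scholar_work_fulltext.split_01.refs.json.gz | head -n1000000 \
        | jq -c 'del(.release_ident, .work_ident)' | gzip | wc -c

If the first number is a large fraction of the second, the idents really are
what's inflating the compressed corpus.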
Let's do the rest of these, so we can upload as a corpus (estimate 168 GByte
compressed: 24 GByte per split, times 7 splits).

zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_00.json.gz | pv -l | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_scholar.transform run_refs | gzip > /bigger/scholar/fatcat_scholar_work_fulltext.split_00.refs.json.gz
zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_02.json.gz | pv -l | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_scholar.transform run_refs | gzip > /bigger/scholar/fatcat_scholar_work_fulltext.split_02.refs.json.gz
zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_03.json.gz | pv -l | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_scholar.transform run_refs | gzip > /bigger/scholar/fatcat_scholar_work_fulltext.split_03.refs.json.gz
zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_04.json.gz | pv -l | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_scholar.transform run_refs | gzip > /bigger/scholar/fatcat_scholar_work_fulltext.split_04.refs.json.gz
zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_05.json.gz | pv -l | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_scholar.transform run_refs | gzip > /bigger/scholar/fatcat_scholar_work_fulltext.split_05.refs.json.gz
zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_06.json.gz | pv -l | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_scholar.transform run_refs | gzip > /bigger/scholar/fatcat_scholar_work_fulltext.split_06.refs.json.gz

Something went wrong part-way through split_04:

    5.55M 1:12:30 [1.12k/s]
    [...]
    json.decoder.JSONDecodeError: Expecting value: line 1 column 853 (char 852)

Disk corruption or something? This parsed fine with a very similar command
recently. Hrm. Re-ran it and it completed successfully.
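If the JSON error recurs, one quick way to distinguish corrupt input from a
transform bug (a sketch; this was not actually run) is to stream the suspect
split through jq, which aborts at the first unparseable record:

    # prints nothing and exits 0 if every line parses; on a corrupt line jq
    # stops with a "parse error ... at line N" message and a nonzero exit code
    zcat /grande/snapshots/fatcat_scholar_work_fulltext.split_04.json.gz | jq -c 'empty'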