From 0cf608debcd672f9a3c54cb8d4ac1caf686ce2e3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 30 Mar 2020 09:48:39 -0700 Subject: missing: patching metadata for missing fatcat records --- notes/missing_2020-03-20.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'notes') diff --git a/notes/missing_2020-03-20.md b/notes/missing_2020-03-20.md index fa2705b..2576a8e 100644 --- a/notes/missing_2020-03-20.md +++ b/notes/missing_2020-03-20.md @@ -280,3 +280,26 @@ Interesting sites to crawl or translate: => commerical/national holder of 40+ million papers => indexed by EBSCO +## Fetching Metadata + + cat metadata/cord19.2020-03-27.missing.json | jq 'select(.doi != "") | .doi' -r | sort -u > missing_doi.tsv + cat metadata/cord19.2020-03-27.missing.json | jq 'select(.pubmed_id != "") | .pubmed_id' -r | sort -u > missing_pmid.tsv + cat metadata/cord19.2020-03-27.missing.json | jq 'select(.pmcid != "") | .pmcid' -r | sort -u > missing_pmcid.tsv + + cat missing_doi.tsv | parallel -j4 'http --headers head "https://doi.org/{}" | head -n1 | awk "{print \"{}\t\" \$2}"' > missing_doi_status.tsv + + cat missing_doi_status.tsv | rg '404$' | cut -f1 > unregistered_doi.tsv + + cat missing_doi_status.tsv | rg '302$' | cut -f1 | parallel http --json get "https://api.crossref.org/v1/works/http://dx.doi.org/{}" | jq .message -c | pv -l > missing_doi_crossref.json + + mkdir -p pubmed + cat missing_pmcid.tsv | parallel -j1 'http get "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=pubmed" > pubmed/{}.xml' + cat missing_pmid.tsv | parallel -j1 'http get "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=pubmed" > pubmed/{}.xml' + + cat pubmed/*.xml | rg -v '^<\?xml version' | rg -v '^<!DOCTYPE' | rg -v '^<PubmedArticleSet>' | rg -v '^</PubmedArticleSet>' > pubmed_combined.xml + + # Edit file manually to add `<PubmedArticleSet>` and `</PubmedArticleSet>` wrapper.
+ + # in prod: + ./fatcat_import.py pubmed --do-updates /tmp/pubmed_combined.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + -- cgit v1.2.3