diff options
Diffstat (limited to 'notes')
| -rw-r--r-- | notes/missing_2020-03-20.md | 23 | 
1 files changed, 23 insertions, 0 deletions
| diff --git a/notes/missing_2020-03-20.md b/notes/missing_2020-03-20.md index fa2705b..2576a8e 100644 --- a/notes/missing_2020-03-20.md +++ b/notes/missing_2020-03-20.md @@ -280,3 +280,26 @@ Interesting sites to crawl or translate:          => commercial/national holder of 40+ million papers          => indexed by EBSCO +## Fetching Metadata + +    cat metadata/cord19.2020-03-27.missing.json | jq 'select(.doi != "") | .doi' -r | sort -u > missing_doi.tsv +    cat metadata/cord19.2020-03-27.missing.json | jq 'select(.pubmed_id != "") | .pubmed_id' -r | sort -u > missing_pmid.tsv +    cat metadata/cord19.2020-03-27.missing.json | jq 'select(.pmcid != "") | .pmcid' -r | sort -u > missing_pmcid.tsv + +    cat missing_doi.tsv | parallel -j4 'http --headers head "https://doi.org/{}" | head -n1 | awk "{print \"{}\t\" \$2}"' > missing_doi_status.tsv + +    cat missing_doi_status.tsv | rg '404$' | cut -f1 > unregistered_doi.tsv + +    cat missing_doi_status.tsv | rg '302$' | cut -f1 | parallel http --json get "https://api.crossref.org/v1/works/http://dx.doi.org/{}" | jq .message -c | pv -l > missing_doi_crossref.json + +    mkdir -p pubmed +    cat missing_pmcid.tsv | parallel -j1 'http get "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=pubmed" > pubmed/{}.xml' +    cat missing_pmid.tsv | parallel -j1 'http get "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=pubmed" > pubmed/{}.xml' + +    cat pubmed/*.xml | rg -v '^<\?xml version' | rg -v '^<!DOCTYPE' | rg -v '^<PubmedArticleSet>' | rg -v '^</PubmedArticleSet>' > pubmed_combined.xml + +    # Edit file manually to add `<PubmedArticleSet>` and `</PubmedArticleSet>` wrapper. + +    # in prod: +    ./fatcat_import.py pubmed --do-updates /tmp/pubmed_combined.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + | 
