diff options
-rw-r--r-- | notes/missing_2020-03-20.md | 23 |
1 files changed, 23 insertions, 0 deletions
diff --git a/notes/missing_2020-03-20.md b/notes/missing_2020-03-20.md index fa2705b..2576a8e 100644 --- a/notes/missing_2020-03-20.md +++ b/notes/missing_2020-03-20.md @@ -280,3 +280,26 @@ Interesting sites to crawl or translate: => commercial/national holder of 40+ million papers => indexed by EBSCO +## Fetching Metadata + + cat metadata/cord19.2020-03-27.missing.json | jq 'select(.doi != "") | .doi' -r | sort -u > missing_doi.tsv + cat metadata/cord19.2020-03-27.missing.json | jq 'select(.pubmed_id != "") | .pubmed_id' -r | sort -u > missing_pmid.tsv + cat metadata/cord19.2020-03-27.missing.json | jq 'select(.pmcid != "") | .pmcid' -r | sort -u > missing_pmcid.tsv + + cat missing_doi.tsv | parallel -j4 'http --headers head "https://doi.org/{}" | head -n1 | awk "{print \"{}\t\" \$2}"' > missing_doi_status.tsv + + cat missing_doi_status.tsv | rg '404$' | cut -f1 > unregistered_doi.tsv + + cat missing_doi_status.tsv | rg '302$' | cut -f1 | parallel http --json get "https://api.crossref.org/v1/works/http://dx.doi.org/{}" | jq .message -c | pv -l > missing_doi_crossref.json + + mkdir -p pubmed + cat missing_pmcid.tsv | parallel -j1 'http get "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=pubmed" > pubmed/{}.xml' + cat missing_pmid.tsv | parallel -j1 'http get "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=pubmed" > pubmed/{}.xml' + + cat pubmed/*.xml | rg -v '^<\?xml version' | rg -v '^<!DOCTYPE' | rg -v '^<PubmedArticleSet>' | rg -v '^</PubmedArticleSet>' > pubmed_combined.xml + + # Edit file manually to add `<PubmedArticleSet>` and `</PubmedArticleSet>` wrapper. + + # in prod: + ./fatcat_import.py pubmed --do-updates /tmp/pubmed_combined.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + |