aboutsummaryrefslogtreecommitdiffstats
path: root/notes
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-03-30 09:48:39 -0700
committerBryan Newbold <bnewbold@archive.org>2020-03-30 09:48:39 -0700
commit0cf608debcd672f9a3c54cb8d4ac1caf686ce2e3 (patch)
treecac4f85557e63b6b8846c60216741911a002c410 /notes
parente8ff943fbbb6af445374e949c821cc1562e0fefe (diff)
downloadfatcat-covid19-0cf608debcd672f9a3c54cb8d4ac1caf686ce2e3.tar.gz
fatcat-covid19-0cf608debcd672f9a3c54cb8d4ac1caf686ce2e3.zip
missing: patching metadata for missing fatcat records
Diffstat (limited to 'notes')
-rw-r--r--notes/missing_2020-03-20.md23
1 files changed, 23 insertions, 0 deletions
diff --git a/notes/missing_2020-03-20.md b/notes/missing_2020-03-20.md
index fa2705b..2576a8e 100644
--- a/notes/missing_2020-03-20.md
+++ b/notes/missing_2020-03-20.md
@@ -280,3 +280,26 @@ Interesting sites to crawl or translate:
=> commercial/national holder of 40+ million papers
=> indexed by EBSCO
+## Fetching Metadata
+
+ cat metadata/cord19.2020-03-27.missing.json | jq 'select(.doi != "") | .doi' -r | sort -u > missing_doi.tsv
+ cat metadata/cord19.2020-03-27.missing.json | jq 'select(.pubmed_id != "") | .pubmed_id' -r | sort -u > missing_pmid.tsv
+ cat metadata/cord19.2020-03-27.missing.json | jq 'select(.pmcid != "") | .pmcid' -r | sort -u > missing_pmcid.tsv
+
+ cat missing_doi.tsv | parallel -j4 'http --headers head "https://doi.org/{}" | head -n1 | awk "{print \"{}\t\" \$2}"' > missing_doi_status.tsv
+
+ cat missing_doi_status.tsv | rg '404$' | cut -f1 > unregistered_doi.tsv
+
+ cat missing_doi_status.tsv | rg '302$' | cut -f1 | parallel http --json get "https://api.crossref.org/v1/works/http://dx.doi.org/{}" | jq .message -c | pv -l > missing_doi_crossref.json
+
+ mkdir -p pubmed
+ cat missing_pmcid.tsv | parallel -j1 'http get "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=pubmed" > pubmed/{}.xml'
+ cat missing_pmid.tsv | parallel -j1 'http get "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&rettype=pubmed" > pubmed/{}.xml'
+
+ cat pubmed/*.xml | rg -v '^<\?xml version' | rg -v '^<!DOCTYPE' | rg -v '^<PubmedArticleSet>' | rg -v '^</PubmedArticleSet>' > pubmed_combined.xml
+
+ # Edit file manually to add `<PubmedArticleSet>` and `</PubmedArticleSet>` wrapper.
+
+ # in prod:
+ ./fatcat_import.py pubmed --do-updates /tmp/pubmed_combined.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+