From eb05234037c6cb165e548a43859b77f9d5421189 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 12 Sep 2018 11:06:57 -0700 Subject: extid map generation scripts+README --- extra/extid_map/README.md | 109 +++++++++++++++++++++++++++++++++++++++ extra/extid_map/load_core.py | 42 +++++++++++++++ extra/extid_map/load_pmc.py | 47 +++++++++++++++++ extra/extid_map/load_wikidata.py | 43 +++++++++++++++ 4 files changed, 241 insertions(+) create mode 100644 extra/extid_map/README.md create mode 100755 extra/extid_map/load_core.py create mode 100755 extra/extid_map/load_pmc.py create mode 100755 extra/extid_map/load_wikidata.py (limited to 'extra') diff --git a/extra/extid_map/README.md b/extra/extid_map/README.md new file mode 100644 index 00000000..d972ae25 --- /dev/null +++ b/extra/extid_map/README.md @@ -0,0 +1,109 @@ + +Process for generating a sqlite3 database file mapping between: + +- DOI +- Wikidata QID +- CORE database id +- PubMed Central id +- PubMed id + +This fast/indexed local database file, which is about a gigabyte compressed, is +useful for optimizing some fatcat 'release' entity imports and operations. + +There is an example database at + +## Data Sources + +EuropePMC mapping (more works than the USA PubMedCentral mapping) +- +- + +Wikicite data snapshot +- + +CORE dataset +- +- + +## Wikidata Extract + +Early query: + + zcat wikidata-20180806-publications.ndjson.gz.crdownload | rg '"P356"' | jq '{qid: .id, doi: .claims.P356[0]}' -c | pv -l | head + +Polished: + + zcat wikidata-20180903-publications.ndjson.gz | rg '"P356"' | jq '[(.claims.P356[0] | ascii_downcase), .id] | @tsv' -cr | pv -l > doi_wikidata.20180903.tsv + + # 13.6M 0:26:13 [8.65k/s] + +Hrm, got this but seemed to continue: + + jq: error (at :455616): explode input must be a string + +## CORE Extract + + xzcat core_json/*.json.xz | jq -rc 'select(.doi != null) | [(.doi | ascii_downcase), .coreId] | @tsv' | pv -l > doi_core.20180301.tsv + + # bnewbold@bnewbold-dev$ wc -l doi_core.20180301.tsv + # 28210779 doi_core.20180301.tsv + +## PMCID/PMID + +In a nice CSV format, no extract needed. + + bnewbold@bnewbold-dev$ zcat PMID_PMCID_DOI.csv.gz | rg doi.org | wc -l + 19441168 + +## sqlite schema + + CREATE TABLE ids (doi text not null, core int, pmid int, pmcid text, wikidata text); + CREATE UNIQUE INDEX ids_doi on ids (doi); + +Run CORE import first (largest mapping, thus fewest exception/update paths), +then pubmed, then wikidata. + +## Run import + + cat doi_core.20180301.tsv | ./load_core.py release_ids.db + => read 28210000, wrote 28210000 + => overnight? but seemed to run fast + + zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db + => uhoh, seems very slow. lots of IOWAIT. switching to host with SSD + => wow, like 5-10x faster at least. sigh. + => ran in a few hours + + zcat PMID_PMCID_DOI.csv.gz | ./load_pmc.py release_ids.db + => read 29692000, wrote 19441136 + + + sqlite> select count(*) from ids; + 24831337 + + # only 2.3 million works have all IDs + sqlite> select count(*) from ids where wikidata not null and core not null and pmid not null; + 2314700 + + # almost (but not all) PubMedCentral items are PubMed items + sqlite> select count(*) from ids where pmid not null; + 19328761 + sqlite> select count(*) from ids where pmcid not null; + 3739951 + sqlite> select count(*) from ids where pmcid not null and pmid not null; + 3682486 + + # wikidata DOIs seem to mostly come from pmid mapping + sqlite> select count(*) from ids where wikidata not null; + 13608497 + sqlite> select count(*) from ids where pmid not null and wikidata not null; + 13179156 + + # core IDs are more independent (?) + sqlite> select count(*) from ids where core not null; + 7903910 + sqlite> select count(*) from ids where core not null and wikidata not null; + 2372369 + sqlite> select count(*) from ids where core not null and pmid not null; + 2783344 + diff --git a/extra/extid_map/load_core.py b/extra/extid_map/load_core.py new file mode 100755 index 00000000..a6a09e8a --- /dev/null +++ b/extra/extid_map/load_core.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +""" +Run like: + + zcat CORE_FILE | ./load_core.py release_ids.db +""" + +import sys +import csv +import sqlite3 + +def run(db_path): + #db = sqlite3.connect("file:{}?mode=ro".format(db_path) + db = sqlite3.connect(db_path) + c = db.cursor() + count = 0 + inserted = 0 + for row in sys.stdin: + row = row.strip().split("\t") + if len(row) != 2: + continue + (doi, coreid) = row[:2] + if count % 1000 == 0: + print("read {}, wrote {}".format(count, inserted)) + db.commit() + count = count + 1 + coreid = int(coreid) + doi = doi.lower() + # UPSERTS were only added to sqlite3 in summer 2018 (not in xenial version) + try: + c.execute("""INSERT INTO ids (doi, core) VALUES (?, ?)""", (doi, coreid)) + except sqlite3.IntegrityError: + c.execute("""UPDATE ids SET core = ? WHERE doi = ?""", (coreid, doi)) + inserted = inserted + 1 + db.commit() + db.close() + +if __name__=="__main__": + if len(sys.argv) != 2: + print("Need single argument: db_path") + sys.exit(-1) + run(sys.argv[1]) diff --git a/extra/extid_map/load_pmc.py b/extra/extid_map/load_pmc.py new file mode 100755 index 00000000..5454541e --- /dev/null +++ b/extra/extid_map/load_pmc.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +""" +Run like: + + zcat PMID_PMCID_DOI.csv.gz | ./load_pmc.py release_ids.db +""" + +import sys +import csv +import sqlite3 + +def run(db_path): + #db = sqlite3.connect("file:{}?mode=ro".format(db_path) + db = sqlite3.connect(db_path) + reader = csv.reader(sys.stdin) + c = db.cursor() + count = 0 + inserted = 0 + for row in reader: + (pmid, pmcid, doi) = row + if count % 1000 == 0: + print("read {}, wrote {}".format(count, inserted)) + db.commit() + count = count + 1 + if not doi.startswith("http"): + continue + doi = doi.replace("https://doi.org/", "").lower() + if pmcid == '': + pmcid = None + if pmid == '': + pmid = None + else: + pmid = int(pmid) + # UPSERTS were only added to sqlite3 in summer 2018 (not in xenial version) + try: + c.execute("""INSERT INTO ids (doi, pmid, pmcid) VALUES (?, ?, ?)""", (doi, pmid, pmcid)) + except sqlite3.IntegrityError: + c.execute("""UPDATE ids SET pmid = ?, pmcid = ? WHERE doi = ?""", (pmid, pmcid, doi)) + inserted = inserted + 1 + db.commit() + db.close() + +if __name__=="__main__": + if len(sys.argv) != 2: + print("Need single argument: db_path") + sys.exit(-1) + run(sys.argv[1]) diff --git a/extra/extid_map/load_wikidata.py b/extra/extid_map/load_wikidata.py new file mode 100755 index 00000000..8d29a58a --- /dev/null +++ b/extra/extid_map/load_wikidata.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Run like: + + zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db +""" + +import sys +import csv +import sqlite3 + +def run(db_path): + #db = sqlite3.connect("file:{}?mode=ro".format(db_path) + db = sqlite3.connect(db_path) + c = db.cursor() + count = 0 + inserted = 0 + for row in sys.stdin: + row = row.strip().split("\t") + if len(row) != 2: + continue + (doi, qid) = row[:2] + if count % 1000 == 0: + print("read {}, wrote {}".format(count, inserted)) + db.commit() + count = count + 1 + if not doi.startswith("10.") or not qid.startswith('Q'): + continue + doi = doi.lower() + # UPSERTS were only added to sqlite3 in summer 2018 (not in xenial version) + try: + c.execute("""INSERT INTO ids (doi, wikidata) VALUES (?, ?)""", (doi, qid)) + except sqlite3.IntegrityError: + c.execute("""UPDATE ids SET wikidata = ? WHERE doi = ?""", (qid, doi)) + inserted = inserted + 1 + db.commit() + db.close() + +if __name__=="__main__": + if len(sys.argv) != 2: + print("Need single argument: db_path") + sys.exit(-1) + run(sys.argv[1]) -- cgit v1.2.3