| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-12 11:06:57 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-12 11:06:57 -0700 |
| commit | eb05234037c6cb165e548a43859b77f9d5421189 (patch) | |
| tree | 513da5f21b07b69a567367b6c189a783dfbe4303 | |
| parent | 8baf1ecb97b376e0135bb637330571643019b5f7 (diff) | |
| download | fatcat-eb05234037c6cb165e548a43859b77f9d5421189.tar.gz, fatcat-eb05234037c6cb165e548a43859b77f9d5421189.zip | |
extid map generation scripts+README
| mode | file | insertions |
|---|---|---|
| -rw-r--r-- | extra/extid_map/README.md | 109 |
| -rwxr-xr-x | extra/extid_map/load_core.py | 42 |
| -rwxr-xr-x | extra/extid_map/load_pmc.py | 47 |
| -rwxr-xr-x | extra/extid_map/load_wikidata.py | 43 |

4 files changed, 241 insertions, 0 deletions
diff --git a/extra/extid_map/README.md b/extra/extid_map/README.md
new file mode 100644
index 00000000..d972ae25
--- /dev/null
+++ b/extra/extid_map/README.md
@@ -0,0 +1,109 @@
+
+Process for generating a sqlite3 database file mapping between:
+
+- DOI
+- Wikidata QID
+- CORE database id
+- PubMed Central id
+- PubMed id
+
+This fast/indexed local database file, which is about a gigabyte compressed, is
+useful for optimizing some fatcat 'release' entity imports and operations.
+
+There is an example database at <https://archive.org/details/ia_journal_pid_map_munge_20180908>
+
+## Data Sources
+
+EuropePMC mapping (covers more works than the USA PubMedCentral mapping):
+
+- <ftp://ftp.ebi.ac.uk/pub/databases/pmc/DOI/PMID_PMCID_DOI.csv.gz>
+- <https://archive.org/details/europepmc-id-map-2018-08-31>
+
+Wikicite data snapshot:
+
+- <https://archive.org/details/wikicite-biblio-data-20180903>
+
+CORE dataset:
+
+- <https://core.ac.uk/services#api>
+- <https://archive.org/download/core_oa_metadata_20180301>
+
+## Wikidata Extract
+
+Early query:
+
+    zcat wikidata-20180806-publications.ndjson.gz.crdownload | rg '"P356"' | jq '{qid: .id, doi: .claims.P356[0]}' -c | pv -l | head
+
+Polished:
+
+    zcat wikidata-20180903-publications.ndjson.gz | rg '"P356"' | jq '[(.claims.P356[0] | ascii_downcase), .id] | @tsv' -cr | pv -l > doi_wikidata.20180903.tsv
+
+    # 13.6M 0:26:13 [8.65k/s]
+
+Hrm, got this error, but the run seemed to continue:
+
+    jq: error (at <stdin>:455616): explode input must be a string
+
+## CORE Extract
+
+    xzcat core_json/*.json.xz | jq -rc 'select(.doi != null) | [(.doi | ascii_downcase), .coreId] | @tsv' | pv -l > doi_core.20180301.tsv
+
+    # bnewbold@bnewbold-dev$ wc -l doi_core.20180301.tsv
+    # 28210779 doi_core.20180301.tsv
+
+## PMCID/PMID
+
+Already in a nice CSV format; no extract needed.
+
+    bnewbold@bnewbold-dev$ zcat PMID_PMCID_DOI.csv.gz | rg doi.org | wc -l
+    19441168
+
+## sqlite schema
+
+    CREATE TABLE ids (doi text not null, core int, pmid int, pmcid text, wikidata text);
+    CREATE UNIQUE INDEX ids_doi on ids (doi);
+
+Run the CORE import first (it is the largest mapping, and thus hits the fewest
+exception/update paths), then pubmed, then wikidata.
+
+## Run import
+
+    cat doi_core.20180301.tsv | ./load_core.py release_ids.db
+    => read 28210000, wrote 28210000
+    => overnight? but seemed to run fast
+
+    zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db
+    => uhoh, seems very slow. lots of IOWAIT. switching to host with SSD
+    => wow, like 5-10x faster at least. sigh.
+    => ran in a few hours
+
+    zcat PMID_PMCID_DOI.csv.gz | ./load_pmc.py release_ids.db
+    => read 29692000, wrote 19441136
+
+Sanity-check counts:
+
+    sqlite> select count(*) from ids;
+    24831337
+
+    # only 2.3 million works have all IDs
+    sqlite> select count(*) from ids where wikidata not null and core not null and pmid not null;
+    2314700
+
+    # almost all (but not all) PubMedCentral items are PubMed items
+    sqlite> select count(*) from ids where pmid not null;
+    19328761
+    sqlite> select count(*) from ids where pmcid not null;
+    3739951
+    sqlite> select count(*) from ids where pmcid not null and pmid not null;
+    3682486
+
+    # wikidata DOIs seem to mostly come from the pmid mapping
+    sqlite> select count(*) from ids where wikidata not null;
+    13608497
+    sqlite> select count(*) from ids where pmid not null and wikidata not null;
+    13179156
+
+    # core IDs are more independent (?)
+    sqlite> select count(*) from ids where core not null;
+    7903910
+    sqlite> select count(*) from ids where core not null and wikidata not null;
+    2372369
+    sqlite> select count(*) from ids where core not null and pmid not null;
+    2783344
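For downstream use (e.g., the release import optimizations mentioned above), a lookup against the `ids` table is a single indexed SELECT on the lower-cased DOI. A minimal sketch, not part of this commit, assuming the schema above; the `lookup_doi` helper name is illustrative:

    import sqlite3

    def lookup_doi(db_path, doi):
        # open read-only (URI form), since this is a query-only helper
        db = sqlite3.connect("file:{}?mode=ro".format(db_path), uri=True)
        c = db.cursor()
        # DOIs were stored lower-cased by the loaders, so normalize first;
        # the unique ids_doi index makes this a fast point lookup
        c.execute("SELECT doi, core, pmid, pmcid, wikidata FROM ids WHERE doi = ?",
                  (doi.lower(),))
        row = c.fetchone()
        db.close()
        if row is None:
            return None
        return dict(zip(("doi", "core", "pmid", "pmcid", "wikidata"), row))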
diff --git a/extra/extid_map/load_core.py b/extra/extid_map/load_core.py
new file mode 100755
index 00000000..a6a09e8a
--- /dev/null
+++ b/extra/extid_map/load_core.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+Loads (doi, core id) TSV rows into the release_ids.db mapping. Run like:
+
+    zcat CORE_FILE | ./load_core.py release_ids.db
+"""
+
+import sys
+import sqlite3
+
+def run(db_path):
+    db = sqlite3.connect(db_path)
+    c = db.cursor()
+    count = 0
+    inserted = 0
+    for row in sys.stdin:
+        row = row.strip().split("\t")
+        if len(row) != 2:
+            continue
+        (doi, coreid) = row[:2]
+        # print progress and commit every 1000 input rows
+        if count % 1000 == 0:
+            print("read {}, wrote {}".format(count, inserted))
+            db.commit()
+        count = count + 1
+        coreid = int(coreid)
+        doi = doi.lower()
+        # UPSERTs were only added to sqlite3 in summer 2018 (not in the xenial
+        # version), so fall back to INSERT-then-UPDATE on the unique doi index
+        try:
+            c.execute("""INSERT INTO ids (doi, core) VALUES (?, ?)""", (doi, coreid))
+        except sqlite3.IntegrityError:
+            c.execute("""UPDATE ids SET core = ? WHERE doi = ?""", (coreid, doi))
+        inserted = inserted + 1
+    db.commit()
+    db.close()
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Need single argument: db_path")
+        sys.exit(-1)
+    run(sys.argv[1])
diff --git a/extra/extid_map/load_pmc.py b/extra/extid_map/load_pmc.py
new file mode 100755
index 00000000..5454541e
--- /dev/null
+++ b/extra/extid_map/load_pmc.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+"""
+Loads the EuropePMC PMID/PMCID/DOI mapping into release_ids.db. Run like:
+
+    zcat PMID_PMCID_DOI.csv.gz | ./load_pmc.py release_ids.db
+"""
+
+import sys
+import csv
+import sqlite3
+
+def run(db_path):
+    db = sqlite3.connect(db_path)
+    reader = csv.reader(sys.stdin)
+    c = db.cursor()
+    count = 0
+    inserted = 0
+    for row in reader:
+        (pmid, pmcid, doi) = row
+        # print progress and commit every 1000 input rows
+        if count % 1000 == 0:
+            print("read {}, wrote {}".format(count, inserted))
+            db.commit()
+        count = count + 1
+        # skip rows without a DOI URL; they can't be joined into the doi-keyed table
+        if not doi.startswith("http"):
+            continue
+        doi = doi.replace("https://doi.org/", "").lower()
+        if pmcid == '':
+            pmcid = None
+        if pmid == '':
+            pmid = None
+        else:
+            pmid = int(pmid)
+        # UPSERTs were only added to sqlite3 in summer 2018 (not in the xenial
+        # version), so fall back to INSERT-then-UPDATE on the unique doi index
+        try:
+            c.execute("""INSERT INTO ids (doi, pmid, pmcid) VALUES (?, ?, ?)""",
+                      (doi, pmid, pmcid))
+        except sqlite3.IntegrityError:
+            c.execute("""UPDATE ids SET pmid = ?, pmcid = ? WHERE doi = ?""",
+                      (pmid, pmcid, doi))
+        inserted = inserted + 1
+    db.commit()
+    db.close()
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Need single argument: db_path")
+        sys.exit(-1)
+    run(sys.argv[1])
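All three loaders note that native UPSERTs are missing on the xenial host. For reference, on SQLite 3.24.0 (released June 2018) or newer, the try/except fallback in load_core.py could collapse to a single statement; a sketch of what the inner loop's write would look like on such a host:

    # requires SQLite >= 3.24.0; equivalent to load_core.py's
    # INSERT-then-UPDATE fallback on the unique doi index
    c.execute("""INSERT INTO ids (doi, core) VALUES (?, ?)
                 ON CONFLICT(doi) DO UPDATE SET core = excluded.core""",
              (doi, coreid))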
diff --git a/extra/extid_map/load_wikidata.py b/extra/extid_map/load_wikidata.py
new file mode 100755
index 00000000..8d29a58a
--- /dev/null
+++ b/extra/extid_map/load_wikidata.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Loads (doi, wikidata QID) TSV rows into release_ids.db. Run like:
+
+    zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db
+"""
+
+import sys
+import sqlite3
+
+def run(db_path):
+    db = sqlite3.connect(db_path)
+    c = db.cursor()
+    count = 0
+    inserted = 0
+    for row in sys.stdin:
+        row = row.strip().split("\t")
+        if len(row) != 2:
+            continue
+        (doi, qid) = row[:2]
+        # print progress and commit every 1000 input rows
+        if count % 1000 == 0:
+            print("read {}, wrote {}".format(count, inserted))
+            db.commit()
+        count = count + 1
+        # skip malformed DOIs and QIDs
+        if not doi.startswith("10.") or not qid.startswith('Q'):
+            continue
+        doi = doi.lower()
+        # UPSERTs were only added to sqlite3 in summer 2018 (not in the xenial
+        # version), so fall back to INSERT-then-UPDATE on the unique doi index
+        try:
+            c.execute("""INSERT INTO ids (doi, wikidata) VALUES (?, ?)""", (doi, qid))
+        except sqlite3.IntegrityError:
+            c.execute("""UPDATE ids SET wikidata = ? WHERE doi = ?""", (qid, doi))
+        inserted = inserted + 1
+    db.commit()
+    db.close()
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Need single argument: db_path")
+        sys.exit(-1)
+    run(sys.argv[1])
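The README notes heavy IOWAIT during the wikidata load on spinning disk. Since the database can always be regenerated from the source dumps, durability can be traded for throughput during bulk imports; an untested sketch of pragmas that could be set right after connecting in any of the loaders:

    db = sqlite3.connect(db_path)
    # write-ahead log avoids rewriting a rollback journal on every commit
    db.execute("PRAGMA journal_mode = WAL")
    # skip the fsync on each commit; after a crash, rebuild from the TSV/CSV inputs
    db.execute("PRAGMA synchronous = OFF")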