diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-12 11:06:57 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-12 11:06:57 -0700 | 
| commit | eb05234037c6cb165e548a43859b77f9d5421189 (patch) | |
| tree | 513da5f21b07b69a567367b6c189a783dfbe4303 | |
| parent | 8baf1ecb97b376e0135bb637330571643019b5f7 (diff) | |
| download | fatcat-eb05234037c6cb165e548a43859b77f9d5421189.tar.gz fatcat-eb05234037c6cb165e548a43859b77f9d5421189.zip | |
extid map generation scripts+README
| -rw-r--r-- | extra/extid_map/README.md | 109 | ||||
| -rwxr-xr-x | extra/extid_map/load_core.py | 42 | ||||
| -rwxr-xr-x | extra/extid_map/load_pmc.py | 47 | ||||
| -rwxr-xr-x | extra/extid_map/load_wikidata.py | 43 | 
4 files changed, 241 insertions, 0 deletions
| diff --git a/extra/extid_map/README.md b/extra/extid_map/README.md new file mode 100644 index 00000000..d972ae25 --- /dev/null +++ b/extra/extid_map/README.md @@ -0,0 +1,109 @@ + +Process for generating a sqlite3 database file mapping between: + +- DOI +- Wikidata QID +- CORE database id +- PubMed Central id +- PubMed id + +This fast/indexed local database file, which is about a gigabyte compressed, is +useful for optimizing some fatcat 'release' entity imports and operations. + +There is an example database at <https://archive.org/details/ia_journal_pid_map_munge_20180908> + +## Data Sources + +EuropePMC mapping (more works than the USA PubMedCentral mapping) +- <ftp://ftp.ebi.ac.uk/pub/databases/pmc/DOI/PMID_PMCID_DOI.csv.gz> +- <https://archive.org/details/europepmc-id-map-2018-08-31> + +Wikicite data snapshot +- <https://archive.org/details/wikicite-biblio-data-20180903> + +CORE dataset +- <https://core.ac.uk/services#api> +- <https://archive.org/download/core_oa_metadata_20180301> + +## Wikidata Extract + +Early query: + +    zcat wikidata-20180806-publications.ndjson.gz.crdownload | rg '"P356"' | jq '{qid: .id, doi: .claims.P356[0]}' -c | pv -l | head + +Polished: + +    zcat wikidata-20180903-publications.ndjson.gz | rg '"P356"' | jq '[(.claims.P356[0] | ascii_downcase), .id] | @tsv' -cr | pv -l > doi_wikidata.20180903.tsv + +    # 13.6M 0:26:13 [8.65k/s] + +Hrm, got this but seemed to continue: + +    jq: error (at <stdin>:455616): explode input must be a string + +## CORE Extract + +    xzcat core_json/*.json.xz | jq -rc 'select(.doi != null) | [(.doi | ascii_downcase), .coreId] | @tsv' | pv -l > doi_core.20180301.tsv + +    # bnewbold@bnewbold-dev$ wc -l doi_core.20180301.tsv +    # 28210779 doi_core.20180301.tsv + +## PMCID/PMID + +In a nice CSV format, no extract needed. + +    bnewbold@bnewbold-dev$ zcat PMID_PMCID_DOI.csv.gz | rg doi.org | wc -l +    19441168 + +## sqlite schema + +    CREATE TABLE ids (doi text not null, core int, pmid int, pmcid text, wikidata text); +    CREATE UNIQUE INDEX ids_doi on ids (doi); + +Run CORE import first (largest mapping, thus fewest exception/update paths), +then pubmed, then wikidata. + +## Run import + +    cat doi_core.20180301.tsv | ./load_core.py release_ids.db +    => read 28210000, wrote 28210000 +    => overnight? but seemed to run fast + +    zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db +    => uhoh, seems very slow. lots of IOWAIT. switching to host with SSD +    => wow, like 5-10x faster at least. sigh. +    => ran in a few hours + +    zcat PMID_PMCID_DOI.csv.gz | ./load_pmc.py release_ids.db +    => read 29692000, wrote 19441136 + + +    sqlite> select count(*) from ids; +    24831337 + +    # only 2.3 million works have all IDs +    sqlite> select count(*) from ids where wikidata not null and core not null and pmid not null; +    2314700 + +    # almost (but not all) PubMedCentral items are PubMed items +    sqlite> select count(*) from ids where pmid not null; +    19328761 +    sqlite> select count(*) from ids where pmcid not null; +    3739951 +    sqlite> select count(*) from ids where pmcid not null and pmid not null; +    3682486 + +    # wikidata DOIs seem to mostly come from pmid mapping +    sqlite> select count(*) from ids where wikidata not null; +    13608497 +    sqlite> select count(*) from ids where pmid not null and wikidata not null; +    13179156 + +    # core IDs are more independent (?) +    sqlite> select count(*) from ids where core not null; +    7903910 +    sqlite> select count(*) from ids where core not null and wikidata not null; +    2372369 +    sqlite> select count(*) from ids where core not null and pmid not null; +    2783344 + diff --git a/extra/extid_map/load_core.py b/extra/extid_map/load_core.py new file mode 100755 index 00000000..a6a09e8a --- /dev/null +++ b/extra/extid_map/load_core.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +""" +Run like: + +    zcat CORE_FILE | ./load_core.py release_ids.db +""" + +import sys +import csv +import sqlite3 + +def run(db_path): +    #db = sqlite3.connect("file:{}?mode=ro".format(db_path) +    db = sqlite3.connect(db_path) +    c = db.cursor() +    count = 0 +    inserted = 0 +    for row in sys.stdin: +        row = row.strip().split("\t") +        if len(row) != 2: +            continue +        (doi, coreid) = row[:2] +        if count % 1000 == 0: +            print("read {}, wrote {}".format(count, inserted)) +            db.commit() +        count = count + 1 +        coreid = int(coreid) +        doi = doi.lower() +        # UPSERTS were only added to sqlite3 in summer 2018 (not in xenial version) +        try: +            c.execute("""INSERT INTO ids (doi, core) VALUES (?, ?)""", (doi, coreid)) +        except sqlite3.IntegrityError: +            c.execute("""UPDATE ids SET core = ? WHERE doi = ?""", (coreid, doi)) +        inserted = inserted + 1 +    db.commit() +    db.close() + +if __name__=="__main__": +    if len(sys.argv) != 2: +        print("Need single argument: db_path") +        sys.exit(-1) +    run(sys.argv[1]) diff --git a/extra/extid_map/load_pmc.py b/extra/extid_map/load_pmc.py new file mode 100755 index 00000000..5454541e --- /dev/null +++ b/extra/extid_map/load_pmc.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +""" +Run like: + +    zcat PMID_PMCID_DOI.csv.gz | ./load_pmc.py release_ids.db +""" + +import sys +import csv +import sqlite3 + +def run(db_path): +    #db = sqlite3.connect("file:{}?mode=ro".format(db_path) +    db = sqlite3.connect(db_path) +    reader = csv.reader(sys.stdin) +    c = db.cursor() +    count = 0 +    inserted = 0 +    for row in reader: +        (pmid, pmcid, doi) = row +        if count % 1000 == 0: +            print("read {}, wrote {}".format(count, inserted)) +            db.commit() +        count = count + 1 +        if not doi.startswith("http"): +            continue +        doi = doi.replace("https://doi.org/", "").lower() +        if pmcid == '': +            pmcid = None +        if pmid == '': +            pmid = None +        else: +            pmid = int(pmid) +        # UPSERTS were only added to sqlite3 in summer 2018 (not in xenial version) +        try: +            c.execute("""INSERT INTO ids (doi, pmid, pmcid) VALUES (?, ?, ?)""", (doi, pmid, pmcid)) +        except sqlite3.IntegrityError: +            c.execute("""UPDATE ids SET pmid = ?, pmcid = ? WHERE doi = ?""", (pmid, pmcid, doi)) +        inserted = inserted + 1 +    db.commit() +    db.close() + +if __name__=="__main__": +    if len(sys.argv) != 2: +        print("Need single argument: db_path") +        sys.exit(-1) +    run(sys.argv[1]) diff --git a/extra/extid_map/load_wikidata.py b/extra/extid_map/load_wikidata.py new file mode 100755 index 00000000..8d29a58a --- /dev/null +++ b/extra/extid_map/load_wikidata.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Run like: + +    zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db +""" + +import sys +import csv +import sqlite3 + +def run(db_path): +    #db = sqlite3.connect("file:{}?mode=ro".format(db_path) +    db = sqlite3.connect(db_path) +    c = db.cursor() +    count = 0 +    inserted = 0 +    for row in sys.stdin: +        row = row.strip().split("\t") +        if len(row) != 2: +            continue +        (doi, qid) = row[:2] +        if count % 1000 == 0: +            print("read {}, wrote {}".format(count, inserted)) +            db.commit() +        count = count + 1 +        if not doi.startswith("10.") or not qid.startswith('Q'): +            continue +        doi = doi.lower() +        # UPSERTS were only added to sqlite3 in summer 2018 (not in xenial version) +        try: +            c.execute("""INSERT INTO ids (doi, wikidata) VALUES (?, ?)""", (doi, qid)) +        except sqlite3.IntegrityError: +            c.execute("""UPDATE ids SET wikidata = ? WHERE doi = ?""", (qid, doi)) +        inserted = inserted + 1 +    db.commit() +    db.close() + +if __name__=="__main__": +    if len(sys.argv) != 2: +        print("Need single argument: db_path") +        sys.exit(-1) +    run(sys.argv[1]) | 
