aboutsummaryrefslogtreecommitdiffstats
path: root/extra/extid_map/load_wikidata.py
blob: 8d29a58a63ce28e009f2313c2b5d16f30642fc4a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env python3
"""
Run like:

    zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db
"""

import sys
import csv
import sqlite3

def run(db_path):
    """Load ``doi<TAB>qid`` lines from stdin into the ``ids`` table.

    Each line of ``sys.stdin`` is stripped and split on tabs. Lines that do
    not yield exactly two fields are skipped and are not counted. Of the
    remaining lines, only those whose DOI starts with ``10.`` and whose QID
    starts with ``Q`` are written; the DOI is lower-cased before writing.

    A progress line is printed and the transaction committed every 1000
    well-formed lines, and once more at the end of input.

    Args:
        db_path: Path of the sqlite3 database file. It must already contain
            an ``ids`` table with ``doi`` and ``wikidata`` columns.

    Raises:
        sqlite3.Error: Any database error other than the ``IntegrityError``
            raised by the initial INSERT. The connection is closed before
            the exception propagates.
    """
    db = sqlite3.connect(db_path)
    try:
        c = db.cursor()
        count = 0
        inserted = 0
        for line in sys.stdin:
            fields = line.strip().split("\t")
            if len(fields) != 2:
                continue
            doi, qid = fields
            # Checked before the increment, so the first report is
            # "read 0, wrote 0" and later ones land on multiples of 1000.
            if count % 1000 == 0:
                print("read {}, wrote {}".format(count, inserted))
                db.commit()
            count += 1
            if not doi.startswith("10.") or not qid.startswith('Q'):
                continue
            doi = doi.lower()
            # UPSERT was only added to SQLite in summer 2018 (not in the
            # xenial version), so emulate it: try the INSERT and fall back to
            # an UPDATE when it is rejected (presumably by a uniqueness
            # constraint on doi -- the schema is not defined in this file).
            try:
                c.execute("""INSERT INTO ids (doi, wikidata) VALUES (?, ?)""", (doi, qid))
            except sqlite3.IntegrityError:
                c.execute("""UPDATE ids SET wikidata = ? WHERE doi = ?""", (qid, doi))
            inserted += 1
        db.commit()
    finally:
        # Release the database handle even when the load aborts part-way;
        # anything written since the last commit is discarded in that case.
        db.close()

if __name__ == "__main__":
    # Script entry point: expects exactly one positional argument, the path
    # to the sqlite3 database, which is handed to run().
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        run(cli_args[0])
    else:
        print("Need single argument: db_path")
        sys.exit(-1)