blob: 8d29a58a63ce28e009f2313c2b5d16f30642fc4a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
#!/usr/bin/env python3
"""
Run like:
zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db
"""
import sys
import csv
import sqlite3
def run(db_path):
#db = sqlite3.connect("file:{}?mode=ro".format(db_path)
db = sqlite3.connect(db_path)
c = db.cursor()
count = 0
inserted = 0
for row in sys.stdin:
row = row.strip().split("\t")
if len(row) != 2:
continue
(doi, qid) = row[:2]
if count % 1000 == 0:
print("read {}, wrote {}".format(count, inserted))
db.commit()
count = count + 1
if not doi.startswith("10.") or not qid.startswith('Q'):
continue
doi = doi.lower()
# UPSERTS were only added to sqlite3 in summer 2018 (not in xenial version)
try:
c.execute("""INSERT INTO ids (doi, wikidata) VALUES (?, ?)""", (doi, qid))
except sqlite3.IntegrityError:
c.execute("""UPDATE ids SET wikidata = ? WHERE doi = ?""", (qid, doi))
inserted = inserted + 1
db.commit()
db.close()
if __name__=="__main__":
if len(sys.argv) != 2:
print("Need single argument: db_path")
sys.exit(-1)
run(sys.argv[1])
|