summary | refs | log | tree | commit | diff | stats
path: root/extra/extid_map/load_wikidata.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2018-09-12 11:06:57 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-09-12 11:06:57 -0700
commit: eb05234037c6cb165e548a43859b77f9d5421189 (patch)
tree: 513da5f21b07b69a567367b6c189a783dfbe4303 /extra/extid_map/load_wikidata.py
parent: 8baf1ecb97b376e0135bb637330571643019b5f7 (diff)
download: fatcat-eb05234037c6cb165e548a43859b77f9d5421189.tar.gz
download: fatcat-eb05234037c6cb165e548a43859b77f9d5421189.zip
extid map generation scripts+README
Diffstat (limited to 'extra/extid_map/load_wikidata.py')
-rwxr-xr-x extra/extid_map/load_wikidata.py 43
1 files changed, 43 insertions, 0 deletions
diff --git a/extra/extid_map/load_wikidata.py b/extra/extid_map/load_wikidata.py
new file mode 100755
index 00000000..8d29a58a
--- /dev/null
+++ b/extra/extid_map/load_wikidata.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Run like:
+
+ zcat doi_wikidata.20180903.tsv.gz | ./load_wikidata.py release_ids.db
+"""
+
+import sys
+import csv
+import sqlite3
+
def run(db_path, lines=None):
    """Load DOI -> Wikidata QID pairs from TSV lines into the ``ids`` table.

    Each input line is expected to hold exactly two tab-separated fields: a
    DOI followed by a Wikidata QID. Lines with any other field count are
    skipped without being counted. Lines whose DOI does not start with
    ``10.`` or whose QID does not start with ``Q`` are counted as read but
    not written. DOIs are lower-cased before being stored.

    A progress line is printed to stdout, and the pending work committed,
    each time the number of rows read so far is a multiple of 1000.

    Args:
        db_path: Path of the sqlite3 database. It must already contain an
            ``ids`` table with ``doi`` and ``wikidata`` columns.
        lines: Optional iterable of text lines to load. Defaults to
            ``sys.stdin``, which is how the command-line script feeds it.
    """
    if lines is None:
        lines = sys.stdin
    db = sqlite3.connect(db_path)
    try:
        c = db.cursor()
        count = 0
        inserted = 0
        for line in lines:
            row = line.strip().split("\t")
            if len(row) != 2:
                continue
            doi, qid = row
            if count % 1000 == 0:
                print("read {}, wrote {}".format(count, inserted))
                db.commit()
            count += 1
            if not doi.startswith("10.") or not qid.startswith("Q"):
                continue
            doi = doi.lower()
            # UPSERTS were only added to sqlite3 in summer 2018 (not in the
            # xenial version), so emulate one: try the INSERT first and fall
            # back to UPDATE when the row already exists.
            # NOTE(review): this relies on ``ids.doi`` carrying a UNIQUE or
            # PRIMARY KEY constraint -- confirm against the table schema.
            try:
                c.execute(
                    """INSERT INTO ids (doi, wikidata) VALUES (?, ?)""",
                    (doi, qid),
                )
            except sqlite3.IntegrityError:
                c.execute(
                    """UPDATE ids SET wikidata = ? WHERE doi = ?""",
                    (qid, doi),
                )
            # Counts every row written, whether inserted or updated.
            inserted += 1
        db.commit()
    finally:
        # Always release the connection, even when loading fails part-way.
        db.close()
+
if __name__ == "__main__":
    # Script entry point: exactly one positional argument, the path of the
    # sqlite database to load into, is required.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Need single argument: db_path")
        sys.exit(-1)
    run(cli_args[0])