aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-09-14 16:57:37 -0700
committerBryan Newbold <bnewbold@archive.org>2018-09-14 16:57:37 -0700
commit8e67baf622daa21ceca1b7cbf13f5461d9d8029a (patch)
tree488267b00ee483a48d9dde03ee2785f15d919f65 /python
parentf6b9afe760ef4f5b1d06d99d3f53028745f48124 (diff)
downloadsandcrawler-8e67baf622daa21ceca1b7cbf13f5461d9d8029a.tar.gz
sandcrawler-8e67baf622daa21ceca1b7cbf13f5461d9d8029a.zip
add manifest sqlite3 -> JSON converter
Diffstat (limited to 'python')
-rwxr-xr-xpython/manifest_converter.py58
1 files changed, 58 insertions, 0 deletions
diff --git a/python/manifest_converter.py b/python/manifest_converter.py
new file mode 100755
index 0000000..f0d0bc7
--- /dev/null
+++ b/python/manifest_converter.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""
+Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of
+"match" JSON objects which can be imported into fatcat with matched_import.py
+
+This was used to convert this manifest:
+
+ https://archive.org/details/ia_papers_manifest_2018-01-25/
+
+to JSON format for fast fatcat importing.
+"""
+
+import sys
+import json
+import sqlite3
+import itertools
+
+# iterate over rows in files metadata...
+# 1. select all identified DOIs
+# => filter based on count
+# 2. select all file metadata
+# 3. output object
+
+def or_none(s):
+ if s is None:
+ return None
+ elif type(s) == str and (len(s) == 0 or s == "\\N" or s == "-"):
+ return None
+ else:
+ return s
+
+def process_db(db_path):
+
+ db = sqlite3.connect(db_path)
+
+ for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"):
+ sha1 = row[0]
+ dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall()
+ dois = [d[0] for d in dois]
+ if len(dois) == 0:
+ continue
+ urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall()
+ if len(urls) == 0:
+ continue
+ cdx = [dict(url=row[0], dt=row[1]) for row in urls]
+ obj = dict(
+ sha1=sha1,
+ mimetype=or_none(row[1]),
+ size=(or_none(row[2]) and int(row[2])),
+ md5=or_none(row[3]),
+ dois=dois,
+ cdx=cdx,
+ )
+ dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
+ print(json.dumps(obj))
+
+if __name__=="__main__":
+ process_db(sys.argv[1])