diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-09-14 16:57:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-09-14 16:57:37 -0700 |
commit | 8e67baf622daa21ceca1b7cbf13f5461d9d8029a (patch) | |
tree | 488267b00ee483a48d9dde03ee2785f15d919f65 /python | |
parent | f6b9afe760ef4f5b1d06d99d3f53028745f48124 (diff) | |
download | sandcrawler-8e67baf622daa21ceca1b7cbf13f5461d9d8029a.tar.gz sandcrawler-8e67baf622daa21ceca1b7cbf13f5461d9d8029a.zip |
add manifest sqlite3 -> JSON converter
Diffstat (limited to 'python')
-rwxr-xr-x | python/manifest_converter.py | 58 |
1 files changed, 58 insertions, 0 deletions
#!/usr/bin/env python3
"""
Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of
"match" JSON objects which can be imported into fatcat with matched_import.py

This was used to convert this manifest:

    https://archive.org/details/ia_papers_manifest_2018-01-25/

to JSON format for fast fatcat importing.
"""

import sys
import json
import sqlite3
import itertools

# iterate over rows in files metadata...
# 1. select all identified DOIs
#   => filter based on count (files with zero DOIs are skipped)
# 2. select all file metadata (files with zero URLs are skipped)
# 3. output object


def or_none(s):
    """Normalize "empty" placeholder values to None.

    Returns None when `s` is None, or when it is a string that is empty,
    the literal two characters backslash-N, or a single dash. Any other
    value (including non-string values such as integers) is returned
    unchanged.
    """
    if s is None:
        return None
    # isinstance (not an exact type comparison) so str subclasses are
    # normalized as well
    if isinstance(s, str) and s in ("", "\\N", "-"):
        return None
    return s


def process_db(db_path):
    """Print one JSON object per line for each matchable file in the manifest.

    Opens the sqlite3 database at `db_path` and walks every row of the
    `files_metadata` table. A file is emitted only if it has at least one
    row in `files_id_doi` and at least one row in `urls` for its sha1.

    Each emitted object has the keys `sha1`, `mimetype`, `size`, `md5`,
    `dois` (list of DOI values) and `cdx` (list of `{"url", "dt"}` objects).
    Placeholder metadata values are mapped to null via `or_none`.

    Raises whatever sqlite3 raises if the database cannot be opened or does
    not contain the expected tables; the connection is closed either way.
    """
    db = sqlite3.connect(db_path)
    try:
        files = db.execute(
            "SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata")
        for sha1, mimetype, size_bytes, md5 in files:
            doi_rows = db.execute(
                "SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall()
            if not doi_rows:
                continue
            url_rows = db.execute(
                "SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall()
            if not url_rows:
                continue

            # Explicit None check rather than relying on truthiness of the
            # normalized value.
            size = or_none(size_bytes)
            if size is not None:
                size = int(size)

            obj = dict(
                sha1=sha1,
                mimetype=or_none(mimetype),
                size=size,
                md5=or_none(md5),
                dois=[doi for (doi,) in doi_rows],
                cdx=[dict(url=url, dt=dt) for url, dt in url_rows],
            )
            print(json.dumps(obj))
    finally:
        # Always release the database handle, even if a query or the JSON
        # serialization fails part-way through.
        db.close()


if __name__ == "__main__":
    process_db(sys.argv[1])