aboutsummaryrefslogtreecommitdiffstats
path: root/python/manifest_converter.py
blob: f0d0bc775a555cec5a885030016042a7e9468216 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
"""
Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of
"match" JSON objects which can be imported into fatcat with matched_import.py

This was used to convert this manifest:

    https://archive.org/details/ia_papers_manifest_2018-01-25/

to JSON format for fast fatcat importing.
"""

import sys
import json
import sqlite3
import itertools

# iterate over rows in files metadata...
# 1. select all identified DOIs
#   => filter based on count
# 2. select all file metadata
# 3. output object

def or_none(s):
    if s is None:
        return None
    elif type(s) == str and (len(s) == 0 or s == "\\N" or s == "-"):
        return None
    else:
        return s

def process_db(db_path):
    
    db = sqlite3.connect(db_path)

    for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"):
        sha1 = row[0]
        dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall()
        dois = [d[0] for d in dois]
        if len(dois) == 0:
            continue
        urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall()
        if len(urls) == 0:
            continue
        cdx = [dict(url=row[0], dt=row[1]) for row in urls]
        obj = dict(
            sha1=sha1,
            mimetype=or_none(row[1]),
            size=(or_none(row[2]) and int(row[2])),
            md5=or_none(row[3]),
            dois=dois,
            cdx=cdx,
        )
        dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
        print(json.dumps(obj))

if __name__=="__main__":
    process_db(sys.argv[1])