#!/usr/bin/env python3

"""
Count Chocula - online serials metadata and stats

  "one, two, three, un-preserved web-native open-access long-tail indie
  journals, hah, hah, hah!"

  (yeah, I know, this name isn't very good)
  (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html)

Commands:

    everything
    init_db
    summarize
    export
    export_fatcat

    index_doaj
    index_road
    index_crossref
    index_entrez
    index_norwegian
    index_szczepanski
    index_ezb
    index_wikidata
    index_openapc
    index_sim

    load_fatcat_containers
    load_fatcat_stats
    load_homepage_status

    export_urls

Future commands:

    index_jurn
    index_datacite
    preserve_kbart --keeper SLUG
    preserve_sim

See TODO.md for more work-in-progress
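
Example invocation (run as a module; --db-file defaults to chocula.sqlite):

    python -m chocula init_db
    python -m chocula everything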
"""

import sys
import csv
import argparse

from chocula import ChoculaDatabase, ChoculaConfig, IssnDatabase, ALL_CHOCULA_DIR_CLASSES


def run_everything(config, database):
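    """
    Run the full pipeline: initialize the database, run every registered
    directory loader, load fatcat container/stats and homepage status info,
    then summarize into the 'journals' table.
    """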

    database.init_db()
    for cls in ALL_CHOCULA_DIR_CLASSES:
        loader = cls(config)
        counts = loader.index_file(database)
        print(counts)

    # XXX: TODO:
    database.load_fatcat_containers(config)
    database.load_fatcat_stats(config)
    # XXX: TODO:
    #self.preserve_kbart('lockss', LOCKSS_FILE)
    #self.preserve_kbart('clockss', CLOCKSS_FILE)
    #self.preserve_kbart('portico', PORTICO_FILE)
    #self.preserve_kbart('jstor', JSTOR_FILE)
    #self.preserve_sim(args)
    database.load_homepage_status(config)
    database.summarize()
    print("### Done with everything!")

def run_index(config, database, cls):
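    """Run a single directory loader class against the database and print counts."""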
    loader = cls(config)
    counts = loader.index_file(database)
    print(counts)

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    parser.add_argument("--db-file",
        help="sqlite database file",
        default='chocula.sqlite',
        type=str)

    sub = subparsers.add_parser('everything',
        help="run all the commands")
    sub.set_defaults(func='everything')

    sub = subparsers.add_parser('init_db',
        help="create sqlite3 output file and tables")
    sub.set_defaults(func='init_db')

    sub = subparsers.add_parser('summarize',
        help="aggregate metadata from all tables into 'journals' table")
    sub.set_defaults(func='summarize')

    sub = subparsers.add_parser('export',
        help="dump JSON output")
    sub.set_defaults(func='export')

    sub = subparsers.add_parser('export_fatcat',
        help="dump JSON output in a format that can load into fatcat")
    sub.set_defaults(func='export_fatcat')

    for cls in ALL_CHOCULA_DIR_CLASSES:
        sub = subparsers.add_parser('index_{}'.format(cls.source_slug),
            help="load metadata from {}".format(cls.source_slug))
        sub.set_defaults(func='index_{}'.format(cls.source_slug), index_cls=cls)

    sub = subparsers.add_parser('load_fatcat_containers',
        help="load fatcat container metadata")
    sub.set_defaults(func='load_fatcat_containers')

    sub = subparsers.add_parser('load_fatcat_stats',
        help="update container-level stats from JSON file")
    sub.set_defaults(func='load_fatcat_stats')

    sub = subparsers.add_parser('export_urls',
        help="dump homepage URLs (eg, to crawl for status)")
    sub.set_defaults(func='export_urls')

    sub = subparsers.add_parser('load_homepage_status',
        help="import homepage URL crawl status")
    sub.set_defaults(func='load_homepage_status')

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do! (try --help)")
        sys.exit(-1)

    config = ChoculaConfig.from_file()
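    # the ISSN-L lookup table is only needed for indexing and summarize steps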
    if args.func.startswith('index_') or args.func in ('everything', 'summarize'):
        issn_db = IssnDatabase(config.issnl.filepath)
    else:
        issn_db = None
    cdb = ChoculaDatabase(args.db_file, issn_db)
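    # dispatch on the 'func' name registered by each sub-command's set_defaults()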
    if args.func == 'everything':
        run_everything(config, cdb)
    elif args.func.startswith('index_'):
        run_index(config, cdb, args.index_cls)
    elif args.func.startswith('load_'):
        func = getattr(cdb, args.func)
        print(func(config))
    else:
        func = getattr(cdb, args.func)
        print(func(), file=sys.stderr)

if __name__ == '__main__':
    main()