aboutsummaryrefslogtreecommitdiffstats
path: root/covid19_tool.py
blob: b9bea446d7456b130d6c9b0082fe5ca54a7e805e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3

"""
Wrapper CLI tool for invoking code in the `fatcat_covid19` module.

Licensed the same as code under fatcat_covid19/
"""

import sys
import argparse

from fatcat_covid19.enrich import enrich_fatcat_file
from fatcat_covid19.derivatives import enrich_derivatives_file
from fatcat_covid19.transform import transform_es_file


def main():
    """
    Parse command-line arguments and run the selected sub-command.

    Sub-commands:

    - ``webface``: run the flask web interface (``--debug``, ``--host``,
      ``--port``)
    - ``enrich-fatcat``: hand input/output files to `enrich_fatcat_file`
    - ``enrich-derivatives``: hand input/output files and ``--base-dir`` to
      `enrich_derivatives_file`
    - ``transform-es``: hand input/output files to `transform_es_file`

    The three file-processing sub-commands read the positional ``json_file``
    and write to ``--json-output`` (stdout by default). Any file opened for
    them by argparse is closed once the sub-command finishes or raises.

    If no sub-command is given, a hint is printed to stderr and the process
    exits with status -1.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # sentinel so `args.action` always exists, even without a sub-command
    parser.set_defaults(action='none')
    subparsers = parser.add_subparsers()

    sub_webface = subparsers.add_parser('webface',
        help="run flask web interface")
    sub_webface.set_defaults(action='webface')
    sub_webface.add_argument('--debug',
        action='store_true',
        help="enable debugging interface (note: not for everything)")
    sub_webface.add_argument('--host',
        default="127.0.0.1",
        help="listen on this host/IP")
    sub_webface.add_argument('--port',
        type=int,
        default=9119,
        help="listen on this port")

    sub_enrich_fatcat = subparsers.add_parser('enrich-fatcat',
        help="lookup fatcat releases from JSON metadata")
    sub_enrich_fatcat.set_defaults(action='enrich-fatcat')
    sub_enrich_fatcat.add_argument('json_file',
        help="input JSON rows file (eg, CORD-19 parsed JSON)",
        type=argparse.FileType('r'))
    sub_enrich_fatcat.add_argument('--json-output',
        help="file to write to",
        type=argparse.FileType('w'),
        default=sys.stdout)

    sub_enrich_derivatives = subparsers.add_parser('enrich-derivatives',
        help="enrich JSON rows with existing derivative files")
    sub_enrich_derivatives.set_defaults(action='enrich-derivatives')
    sub_enrich_derivatives.add_argument('json_file',
        help="enriched (with fatcat_release) metadata file",
        type=argparse.FileType('r'))
    sub_enrich_derivatives.add_argument('--json-output',
        help="file to write to",
        type=argparse.FileType('w'),
        default=sys.stdout)
    sub_enrich_derivatives.add_argument('--base-dir',
        help="directory to look for files (in 'pdf' subdirectory)",
        default="fulltext_web")

    sub_transform_es = subparsers.add_parser('transform-es',
        help="transform fulltext JSON to elasticsearch schema JSON")
    sub_transform_es.set_defaults(action='transform-es')
    sub_transform_es.add_argument('json_file',
        help="input JSON rows file (fulltext)",
        type=argparse.FileType('r'))
    sub_transform_es.add_argument('--json-output',
        help="file to write to",
        type=argparse.FileType('w'),
        default=sys.stdout)

    args = parser.parse_args()

    if args.action == 'webface':
        # don't import until we use app; otherwise sentry exception reporting happens
        from fatcat_covid19.webface import app
        app.run(debug=args.debug, host=args.host, port=args.port)
        return

    file_actions = ('enrich-fatcat', 'enrich-derivatives', 'transform-es')
    if args.action not in file_actions:
        # stdout is the default data sink of the other sub-commands, so
        # diagnostics go to stderr
        print("tell me what to do!", file=sys.stderr)
        sys.exit(-1)

    try:
        if args.action == 'enrich-fatcat':
            enrich_fatcat_file(args.json_file, args.json_output)
        elif args.action == 'enrich-derivatives':
            enrich_derivatives_file(args.json_file, args.json_output,
                args.base_dir)
        else:
            transform_es_file(args.json_file, args.json_output)
    finally:
        # argparse.FileType opens files eagerly and never closes them; close
        # them here, but leave the process-wide standard streams alone
        # (FileType maps '-' to stdin/stdout, and stdout is the default).
        for stream in (args.json_file, args.json_output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()