aboutsummaryrefslogtreecommitdiffstats
path: root/covid19_tool.py
blob: 6b84f69d51c70bbf7a631ec8ef35992ba13057c7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3

"""
Wrapper CLI tool for invoking code in the `fatcat_covid19` module.

Licensed the same as code under fatcat_covid19/
"""

import sys
import argparse

from fatcat_covid19.parse import parse_cord19_file
from fatcat_covid19.query import query_fatcat
from fatcat_covid19.enrich import enrich_fatcat_file
from fatcat_covid19.dedupe import dedupe_file
from fatcat_covid19.derivatives import enrich_derivatives_file
from fatcat_covid19.transform import transform_es_file


def _add_command(subparsers, name, help_text):
    """Register subcommand `name` and tag parsed args with `action=name`."""
    sub = subparsers.add_parser(name, help=help_text)
    sub.set_defaults(action=name)
    return sub


def _add_json_output(sub):
    """Attach the shared `--json-output` option (writable file, default stdout)."""
    sub.add_argument('--json-output',
        help="file to write to",
        type=argparse.FileType('w'),
        default=sys.stdout)


def _build_parser():
    """Construct the top-level argument parser with every subcommand attached."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # 'none' is what main() sees when no subcommand was given on the command line
    parser.set_defaults(action='none')
    subparsers = parser.add_subparsers()

    sub_parse_cord = _add_command(subparsers, 'parse-cord19',
        "parse a CORD-19 CSV file into JSON")
    sub_parse_cord.add_argument('csv_path',
        help="input CSV file path",
        type=str)
    _add_json_output(sub_parse_cord)

    sub_query_fatcat = _add_command(subparsers, 'query-fatcat',
        "query fatcat search index for releases")
    _add_json_output(sub_query_fatcat)

    sub_dedupe = _add_command(subparsers, 'dedupe',
        "emit only one JSON line per fatcat release_id")
    sub_dedupe.add_argument('--json-input',
        help="input JSON rows file (eg, CORD-19 parsed JSON)",
        type=argparse.FileType('r'),
        default=sys.stdin)
    _add_json_output(sub_dedupe)

    sub_enrich_fatcat = _add_command(subparsers, 'enrich-fatcat',
        "lookup fatcat releases from JSON metadata")
    sub_enrich_fatcat.add_argument('json_file',
        help="input JSON rows file (eg, CORD-19 parsed JSON)",
        type=argparse.FileType('r'))
    _add_json_output(sub_enrich_fatcat)

    sub_enrich_derivatives = _add_command(subparsers, 'enrich-derivatives',
        "enrich JSON rows with existing derivative files")
    sub_enrich_derivatives.add_argument('json_file',
        help="enriched (with fatcat_release) metadata file",
        type=argparse.FileType('r'))
    _add_json_output(sub_enrich_derivatives)
    sub_enrich_derivatives.add_argument('--base-dir',
        help="directory to look for files (in 'pdf' subdirectory)",
        default="fulltext_web")

    sub_transform_es = _add_command(subparsers, 'transform-es',
        "transform fulltext JSON to elasticsearch schema JSON")
    sub_transform_es.add_argument('json_file',
        help="input JSON rows file (fulltext)",
        type=argparse.FileType('r'))
    _add_json_output(sub_transform_es)

    sub_webface = _add_command(subparsers, 'webface',
        "run flask web interface")
    sub_webface.add_argument('--debug',
        action='store_true',
        help="enable debugging interface (note: not for everything)")
    sub_webface.add_argument('--host',
        default="127.0.0.1",
        help="listen on this host/IP")
    sub_webface.add_argument('--port',
        type=int,
        default=9119,
        help="listen on this port")

    return parser


def main(argv=None):
    """
    Parse command-line arguments and dispatch to the selected subcommand.

    `argv` is the list of argument strings to parse (without the program
    name). When it is None, argparse falls back to `sys.argv[1:]`, which is
    the behavior of a plain `main()` call from the script entry point.

    If no subcommand is given, a short message and the usage line are written
    to stderr and the process exits with status -1.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.action == 'parse-cord19':
        parse_cord19_file(args.csv_path, args.json_output)
    elif args.action == 'query-fatcat':
        query_fatcat(args.json_output)
    elif args.action == 'dedupe':
        dedupe_file(args.json_input, args.json_output)
    elif args.action == 'enrich-fatcat':
        enrich_fatcat_file(args.json_file, args.json_output)
    elif args.action == 'enrich-derivatives':
        enrich_derivatives_file(args.json_file, args.json_output,
            args.base_dir)
    elif args.action == 'transform-es':
        transform_es_file(args.json_file, args.json_output)
    elif args.action == 'webface':
        # don't import until we use app; otherwise sentry exception reporting happens
        from fatcat_covid19.webface import app
        app.run(debug=args.debug, host=args.host, port=args.port)
    else:
        # Diagnostics go to stderr so they never mix with JSON piped from stdout.
        print("tell me what to do!", file=sys.stderr)
        parser.print_usage(file=sys.stderr)
        sys.exit(-1)


# Script entry point: only run the CLI when executed directly, not on import.
if "__main__" == __name__:
    main()