1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
#!/usr/bin/env python3
"""
Wrapper CLI tool for invoking code in the `fatcat_covid19` module.
Licensed the same as code under fatcat_covid19/
"""
import sys
import argparse
from fatcat_covid19.enrich import enrich_fatcat_file
from fatcat_covid19.derivatives import enrich_derivatives_file
from fatcat_covid19.transform import transform_es_file
def _add_json_io_arguments(subparser, input_help):
    """Attach the shared `json_file` input and `--json-output` arguments.

    `input_help` is the help text shown for the positional input file.
    """
    subparser.add_argument('json_file',
        help=input_help,
        type=argparse.FileType('r'))
    # A string default is run through `type`, and FileType('w') maps "-" to
    # sys.stdout. Using "-" (rather than the sys.stdout object itself) keeps
    # the "(default: ...)" text in --help readable.
    subparser.add_argument('--json-output',
        help="file to write to ('-' for stdout)",
        type=argparse.FileType('w'),
        default="-")


def _build_parser():
    """Build the top-level argument parser with one sub-command per action.

    Every sub-command stores its name in the `action` attribute of the parsed
    namespace; when no sub-command is given, `action` is 'none'.
    """
    # Sub-parsers do not inherit `formatter_class` from the parent parser, so
    # it is passed explicitly to each one; otherwise defaults are never shown.
    formatter = argparse.ArgumentDefaultsHelpFormatter

    parser = argparse.ArgumentParser(formatter_class=formatter)
    parser.set_defaults(action='none')
    subparsers = parser.add_subparsers()

    sub_webface = subparsers.add_parser('webface',
        formatter_class=formatter,
        help="run flask web interface")
    sub_webface.set_defaults(action='webface')
    sub_webface.add_argument('--debug',
        action='store_true',
        help="enable debugging interface (note: not for everything)")
    sub_webface.add_argument('--host',
        default="127.0.0.1",
        help="listen on this host/IP")
    sub_webface.add_argument('--port',
        type=int,
        default=9119,
        help="listen on this port")

    sub_enrich_fatcat = subparsers.add_parser('enrich-fatcat',
        formatter_class=formatter,
        help="lookup fatcat releases from JSON metadata")
    sub_enrich_fatcat.set_defaults(action='enrich-fatcat')
    _add_json_io_arguments(sub_enrich_fatcat,
        "input JSON rows file (eg, CORD-19 parsed JSON)")

    sub_enrich_derivatives = subparsers.add_parser('enrich-derivatives',
        formatter_class=formatter,
        help="enrich JSON rows with existing derivative files")
    sub_enrich_derivatives.set_defaults(action='enrich-derivatives')
    _add_json_io_arguments(sub_enrich_derivatives,
        "enriched (with fatcat_release) metadata file")
    sub_enrich_derivatives.add_argument('--base-dir',
        help="directory to look for files (in 'pdf' subdirectory)",
        default="fulltext_web")

    sub_transform_es = subparsers.add_parser('transform-es',
        formatter_class=formatter,
        help="transform fulltext JSON to elasticsearch schema JSON")
    sub_transform_es.set_defaults(action='transform-es')
    _add_json_io_arguments(sub_transform_es,
        "input JSON rows file (fulltext)")

    return parser


def main(argv=None):
    """Parse command-line arguments and run the selected sub-command.

    Args:
        argv: optional list of argument strings to parse instead of
            `sys.argv[1:]` (the default when None).

    Raises:
        SystemExit: with status -1 when no sub-command was given; argparse
            itself also exits on `--help` or on invalid arguments.
    """
    args = _build_parser().parse_args(argv)

    if args.action == 'webface':
        # don't import until we use app; otherwise sentry exception reporting happens
        from fatcat_covid19.webface import app
        app.run(debug=args.debug, host=args.host, port=args.port)
    elif args.action == 'enrich-fatcat':
        enrich_fatcat_file(args.json_file, args.json_output)
    elif args.action == 'enrich-derivatives':
        enrich_derivatives_file(args.json_file, args.json_output,
            args.base_dir)
    elif args.action == 'transform-es':
        transform_es_file(args.json_file, args.json_output)
    else:
        # Diagnostics go to stderr so they never mix with JSON on stdout.
        print("tell me what to do!", file=sys.stderr)
        sys.exit(-1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|