#!/usr/bin/env python3
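"""
Command-line tool for submitting and processing sandcrawler ingest requests.

Three sub-commands are provided (see main() below):

  single    ingest a single base URL
  requests  process a file of JSON ingest requests, one per line
  api       start a simple HTTP server that processes ingest requests
"""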
import sys
import json
import argparse
from http.server import HTTPServer

from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
from sandcrawler.ingest_fileset import IngestFilesetWorker

def run_single_ingest(args):
    # build an ingest request from the command-line arguments
    request = dict(
        ingest_type=args.ingest_type,
        base_url=args.url,
        ext_ids=dict(doi=args.doi),
        fatcat=dict(release_ident=args.release_id),
    )
    if args.force_recrawl:
        request['force_recrawl'] = True
    # dataset ingests go through the fileset worker; all other types (pdf,
    # html, etc) go through the file worker
    if request['ingest_type'] in ['dataset']:
        ingester = IngestFilesetWorker(
            try_spn2=not args.no_spn2,
            ingest_file_result_stdout=True,
        )
    else:
        ingester = IngestFileWorker(
            try_spn2=not args.no_spn2,
            html_quick_mode=args.html_quick_mode,
        )
    result = ingester.process(request)
    print(json.dumps(result, sort_keys=True))
    return result
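
# For reference, the request dict constructed above serializes to JSON along
# these lines (illustrative values, not output from a real ingest):
#
#   {"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf",
#    "ext_ids": {"doi": "10.1234/example"}, "fatcat": {"release_ident": null}}
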
def run_requests(args):
    # TODO: switch to using JsonLinePusher
    file_worker = IngestFileWorker(
        try_spn2=not args.no_spn2,
        html_quick_mode=args.html_quick_mode,
    )
    fileset_worker = IngestFilesetWorker(
        try_spn2=not args.no_spn2,
    )
    for line in args.json_file:
        request = json.loads(line.strip())
        if request['ingest_type'] in ['dataset']:
            result = fileset_worker.process(request)
        else:
            result = file_worker.process(request)
        print(json.dumps(result, sort_keys=True))
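
# A minimal input line for `requests` mode might look like this (illustrative
# values; only `ingest_type` is inspected here, other keys are passed through
# to the workers):
#
#   {"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"}
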
def run_api(args):
    # listen on the port requested via --port
    port = args.port
    print("Listening on localhost:{}".format(port))
    # note: the empty host string binds all interfaces, not just localhost
    server = HTTPServer(('', port), IngestFileRequestHandler)
    server.serve_forever()

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    sub_single = subparsers.add_parser('single',
        help="ingests a single base URL")
    sub_single.set_defaults(func=run_single_ingest)
    sub_single.add_argument('ingest_type',
        default="pdf",
        help="type of ingest (pdf, html, etc)")
    sub_single.add_argument('--release-id',
        help="(optional) existing release ident to match to")
    sub_single.add_argument('--doi',
        help="(optional) existing release DOI to match to")
    sub_single.add_argument('--force-recrawl',
        action='store_true',
        help="ignore GWB history and use SPNv2 to re-crawl")
    sub_single.add_argument('--no-spn2',
        action='store_true',
        help="don't use live web (SPNv2)")
    sub_single.add_argument('--html-quick-mode',
        action='store_true',
        help="don't fetch individual sub-resources, just use CDX")
    sub_single.add_argument('url',
        help="URL of paper to fetch")

    sub_requests = subparsers.add_parser('requests',
        help="takes a series of ingest requests (JSON, per line) and runs each")
    sub_requests.set_defaults(func=run_requests)
    sub_requests.add_argument('--no-spn2',
        action='store_true',
        help="don't use live web (SPNv2)")
    sub_requests.add_argument('--html-quick-mode',
        action='store_true',
        help="don't fetch individual sub-resources, just use CDX")
    sub_requests.add_argument('json_file',
        help="JSON file (request per line) to import from (or stdin)",
        nargs='?', default=sys.stdin, type=argparse.FileType('r'))

    sub_api = subparsers.add_parser('api',
        help="starts a simple HTTP server that processes ingest requests")
    sub_api.set_defaults(func=run_api)
    sub_api.add_argument('--port',
        help="HTTP port to listen on",
        default=8083, type=int)

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    args.func(args)


if __name__ == '__main__':
    main()
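
# Example invocations (illustrative; assumes this script is saved as
# ingest_tool.py and marked executable):
#
#   ./ingest_tool.py single pdf https://example.com/paper.pdf --doi 10.1234/example
#   ./ingest_tool.py requests < requests.jsonl
#   ./ingest_tool.py api --port 8083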