summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_ingest.py
diff options
context:
space:
mode:
author: Martin Czygan <martin@archive.org>, 2020-11-19 22:36:55 +0000
committer: Martin Czygan <martin@archive.org>, 2020-11-19 22:36:55 +0000
commit: 03eadfc7e2bee4213345f6464378e87b8f741d20 (patch)
tree: 3e5b13af8ba46b240f9ae53d5f522fb7ee02c219 /python/fatcat_ingest.py
parent: 5afde4690a4653db53fe4962af5da3eb9188d9a2 (diff)
parent: a73b73c2944b3df2a62886c4e6b69c93f5e74222 (diff)
download: fatcat-03eadfc7e2bee4213345f6464378e87b8f741d20.tar.gz
fatcat-03eadfc7e2bee4213345f6464378e87b8f741d20.zip
Merge branch 'bnewbold-xml-html-ingest' into 'master'
HTML webcapture ingest (and XML file ingest). See merge request webgroup/fatcat!88
Diffstat (limited to 'python/fatcat_ingest.py')
-rwxr-xr-x python/fatcat_ingest.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 68676ad2..b9d71a7c 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -87,6 +87,7 @@ def _run_search_dump(args, search):
ingest_request = release_ingest_request(
release,
ingest_request_source="fatcat-ingest",
+ ingest_type=args.ingest_type,
)
if not ingest_request:
continue
@@ -214,6 +215,9 @@ def main():
parser.add_argument('--force-recrawl',
action='store_true',
help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl")
+ parser.add_argument('--ingest-type',
+ default="pdf",
+ help="What medium to ingest (pdf, xml, html)")
subparsers = parser.add_subparsers()
sub_container = subparsers.add_parser('container',