Diffstat (limited to 'python/grobid_tool.py')
-rwxr-xr-x  python/grobid_tool.py  149
1 file changed, 99 insertions, 50 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 2a1d8b5..3ffac98 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -1,21 +1,28 @@
 #!/usr/bin/env python3
-
 """
 These are generally for running one-off tasks from the command line. Output
 might go to stdout, or might go to Kafka topic.

 Example of large parallel run, locally:

-    cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
+    cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
 """

-import sys
-import json
 import argparse
-import datetime
+import json
+import sys
+
+from grobid_tei_xml import parse_document_xml

-from grobid2json import teixml2json
 from sandcrawler import *
+from sandcrawler.grobid import CrossrefRefsWorker
+
+
+def run_single(args):
+    grobid_client = GrobidClient(host_url=args.grobid_host)
+    resp = grobid_client.process_fulltext(blob=args.pdf_file.read())
+    resp["_metadata"] = grobid_client.metadata(resp)
+    print(json.dumps(resp, sort_keys=True))


 def run_extract_json(args):
@@ -30,6 +37,7 @@ def run_extract_json(args):
     pusher = JsonLinePusher(worker, args.json_file)
     pusher.run()

+
 def run_extract_cdx(args):
     grobid_client = GrobidClient(host_url=args.grobid_host)
     wayback_client = WaybackClient()
@@ -40,7 +48,7 @@
             multi_worker,
             args.cdx_file,
             filter_http_statuses=[200, 226],
-            filter_mimetypes=['application/pdf'],
+            filter_mimetypes=["application/pdf"],
             batch_size=args.jobs,
         )
     else:
@@ -49,10 +57,11 @@
             worker,
             args.cdx_file,
             filter_http_statuses=[200, 226],
-            filter_mimetypes=['application/pdf'],
+            filter_mimetypes=["application/pdf"],
         )
     pusher.run()

+
 def run_extract_zipfile(args):
     grobid_client = GrobidClient(host_url=args.grobid_host)
     if args.jobs > 1:
@@ -65,6 +74,7 @@
         pusher = ZipfilePusher(worker, args.zip_file)
     pusher.run()

+
 def run_transform(args):
     grobid_client = GrobidClient()
     for line in args.json_file:
@@ -74,62 +84,101 @@
         if args.metadata_only:
             out = grobid_client.metadata(line)
         else:
-            out = teixml2json(line['tei_xml'])
+            tei_doc = parse_document_xml(line["tei_xml"])
+            out = tei_doc.to_legacy_dict()
         if out:
-            if 'source' in line:
-                out['source'] = line['source']
+            if "source" in line:
+                out["source"] = line["source"]
             print(json.dumps(out))


+def run_parse_crossref_refs(args):
+    grobid_client = GrobidClient(host_url=args.grobid_host)
+    worker = CrossrefRefsWorker(grobid_client, sink=args.sink)
+    pusher = JsonLinePusher(worker, args.json_file)
+    pusher.run()
+
+
 def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--kafka-mode',
-        action='store_true',
-        help="send output to Kafka (not stdout)")
-    parser.add_argument('--kafka-hosts',
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)"
+    )
+    parser.add_argument(
+        "--kafka-hosts",
         default="localhost:9092",
-        help="list of Kafka brokers (host/port) to use")
-    parser.add_argument('--kafka-env',
-        default="dev",
-        help="Kafka topic namespace to use (eg, prod, qa, dev)")
-    parser.add_argument('-j', '--jobs',
-        default=8, type=int,
-        help="parallelism for batch CPU jobs")
-    parser.add_argument('--grobid-host',
-        default="http://grobid.qa.fatcat.wiki",
-        help="GROBID API host/port")
+        help="list of Kafka brokers (host/port) to use",
+    )
+    parser.add_argument(
+        "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+    )
+    parser.add_argument(
+        "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs"
+    )
+    parser.add_argument(
+        "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+    )
     subparsers = parser.add_subparsers()

-    sub_extract_json = subparsers.add_parser('extract-json',
-        help="for each JSON line with CDX info, fetches PDF and does GROBID extraction")
+    sub_single = subparsers.add_parser("single")
+    sub_single.set_defaults(func=run_single)
+    sub_single.add_argument(
+        "pdf_file",
+        help="path to PDF file to process",
+        type=argparse.FileType("rb"),
+    )
+
+    sub_extract_json = subparsers.add_parser(
+        "extract-json",
+        help="for each JSON line with CDX info, fetches PDF and does GROBID extraction",
+    )
     sub_extract_json.set_defaults(func=run_extract_json)
-    sub_extract_json.add_argument('json_file',
+    sub_extract_json.add_argument(
+        "json_file",
         help="JSON file to import from (or '-' for stdin)",
-        type=argparse.FileType('r'))
+        type=argparse.FileType("r"),
+    )

-    sub_extract_cdx = subparsers.add_parser('extract-cdx',
-        help="for each CDX line, fetches PDF and does GROBID extraction")
+    sub_extract_cdx = subparsers.add_parser(
+        "extract-cdx", help="for each CDX line, fetches PDF and does GROBID extraction"
+    )
     sub_extract_cdx.set_defaults(func=run_extract_cdx)
-    sub_extract_cdx.add_argument('cdx_file',
+    sub_extract_cdx.add_argument(
+        "cdx_file",
         help="CDX file to import from (or '-' for stdin)",
-        type=argparse.FileType('r'))
+        type=argparse.FileType("r"),
+    )

-    sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
-        help="opens zipfile, iterates over PDF files inside and does GROBID extract for each")
+    sub_extract_zipfile = subparsers.add_parser(
+        "extract-zipfile",
+        help="opens zipfile, iterates over PDF files inside and does GROBID extract for each",
+    )
     sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
-    sub_extract_zipfile.add_argument('zip_file',
-        help="zipfile with PDFs to extract",
-        type=str)
-
-    sub_transform = subparsers.add_parser('transform')
+    sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str)
+
+    sub_parse_crossref_refs = subparsers.add_parser(
+        "parse-crossref-refs",
+        help="reads Crossref metadata records, parses any unstructured refs with GROBID",
+    )
+    sub_parse_crossref_refs.set_defaults(func=run_parse_crossref_refs)
+    sub_parse_crossref_refs.add_argument(
+        "json_file",
+        help="JSON-L file to process (or '-' for stdin)",
+        type=argparse.FileType("r"),
+    )
+
+    sub_transform = subparsers.add_parser("transform")
     sub_transform.set_defaults(func=run_transform)
-    sub_transform.add_argument('--metadata-only',
-        action='store_true',
-        help="Only pass through bibliographic metadata, not fulltext")
-    sub_transform.add_argument('json_file',
+    sub_transform.add_argument(
+        "--metadata-only",
+        action="store_true",
+        help="Only pass through bibliographic metadata, not fulltext",
+    )
+    sub_transform.add_argument(
+        "json_file",
         help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
-        type=argparse.FileType('r'))
+        type=argparse.FileType("r"),
+    )

     args = parser.parse_args()
     if not args.__dict__.get("func"):
@@ -140,10 +189,10 @@ def main():
     if args.kafka_mode:
         produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
         print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
-        args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
-            produce_topic=produce_topic)
+        args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)

     args.func(args)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
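
Beyond the formatting changes, the main functional swap in this commit is replacing the old grobid2json.teixml2json() helper with the grobid_tei_xml library: parse_document_xml() parses GROBID's TEI-XML, and to_legacy_dict() reproduces the old dict shape, as the diff's run_transform() shows. A minimal standalone sketch of that new transform path, assuming grobid_tei_xml is installed; "document.tei.xml" is a hypothetical input file name:

    import json

    from grobid_tei_xml import parse_document_xml

    # parse a TEI-XML document previously produced by GROBID
    with open("document.tei.xml") as f:
        tei_doc = parse_document_xml(f.read())

    # to_legacy_dict() keeps the same output shape the old teixml2json() produced
    print(json.dumps(tei_doc.to_legacy_dict(), sort_keys=True))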
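
The commit also adds two subcommands: `single`, which runs one local PDF through GROBID and prints the extraction plus its "_metadata" summary as JSON to stdout, and `parse-crossref-refs`, which feeds Crossref metadata records through the new CrossrefRefsWorker. Hypothetical invocations in the style of the docstring example, assuming a GROBID server on localhost:8070; `paper.pdf` and `crossref_records.json` are placeholder file names:

    ./grobid_tool.py --grobid-host http://localhost:8070 single paper.pdf > paper.grobid.json
    cat crossref_records.json | ./grobid_tool.py --grobid-host http://localhost:8070 parse-crossref-refs -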