From da87108eecfd94e02d949a4fe4fc7998a489b934 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 1 Nov 2021 17:54:42 -0700 Subject: crossref persist: make GROBID ref parsing an option (not default) --- python/persist_tool.py | 6 ++++++ python/sandcrawler/persist.py | 23 ++++++++++++++++------- python/sandcrawler_worker.py | 13 +++++++++++-- 3 files changed, 33 insertions(+), 9 deletions(-) (limited to 'python') diff --git a/python/persist_tool.py b/python/persist_tool.py index a4f9812..5cf5776 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -126,6 +126,7 @@ def run_crossref(args): worker = PersistCrossrefWorker( db_url=args.db_url, grobid_client=grobid_client, + parse_refs=args.parse_refs, ) pusher = JsonLinePusher( worker, @@ -267,6 +268,11 @@ def main(): sub_crossref.add_argument( "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port" ) + sub_crossref.add_argument( + "--parse-refs", + action="store_true", + help="use GROBID to parse any unstructured references (default is to not)", + ) args = parser.parse_args() if not args.__dict__.get("func"): diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 4c9d9d7..13b1232 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -676,7 +676,13 @@ class PersistHtmlTeiXmlWorker(GenericPersistDocWorker): class PersistCrossrefWorker(SandcrawlerWorker): - def __init__(self, db_url: str, grobid_client: Optional[GrobidClient], **kwargs): + def __init__( + self, + db_url: str, + grobid_client: Optional[GrobidClient], + parse_refs: bool = True, + **kwargs + ): super().__init__(**kwargs) self.db = SandcrawlerPostgresClient(db_url) self.cur = self.db.conn.cursor() @@ -684,6 +690,7 @@ class PersistCrossrefWorker(SandcrawlerWorker): self.grobid_client = grobid_client else: self.grobid_client = GrobidClient() + self.parse_refs = parse_refs def process(self, record: Any, key: Optional[str] = None) -> Any: """Only do batches (as transactions)""" @@ -702,7 +709,8 @@ class PersistCrossrefWorker(SandcrawlerWorker): record=record, ) ) - refs_batch.append(self.grobid_client.crossref_refs(record)) + if self.parse_refs: + refs_batch.append(self.grobid_client.crossref_refs(record)) resp = self.db.insert_crossref(self.cur, crossref_batch) if len(crossref_batch) < len(batch): @@ -710,11 +718,12 @@ class PersistCrossrefWorker(SandcrawlerWorker): self.counts["insert-crossref"] += resp[0] self.counts["update-crossref"] += resp[1] - resp = self.db.insert_grobid_refs(self.cur, refs_batch) - if len(refs_batch) < len(batch): - self.counts["skip"] += len(batch) - len(refs_batch) - self.counts["insert-grobid_refs"] += resp[0] - self.counts["update-grobid_refs"] += resp[1] + if refs_batch: + resp = self.db.insert_grobid_refs(self.cur, refs_batch) + if len(refs_batch) < len(batch): + self.counts["skip"] += len(batch) - len(refs_batch) + self.counts["insert-grobid_refs"] += resp[0] + self.counts["update-grobid_refs"] += resp[1] self.db.commit() return [] diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 73bd444..c7eca86 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -298,12 +298,16 @@ def run_persist_ingest_file(args): def run_persist_crossref(args): grobid_client = GrobidClient(host_url=args.grobid_host) consume_topic = "fatcat-{}.api-crossref".format(args.env) - worker = PersistCrossrefWorker(db_url=args.db_url, grobid_client=grobid_client) + worker = PersistCrossrefWorker( + db_url=args.db_url, + grobid_client=grobid_client, + parse_refs=args.parse_refs, + ) pusher = KafkaJsonPusher( worker=worker, kafka_hosts=args.kafka_hosts, consume_topic=consume_topic, - group="persist-ingest", + group="persist-crossref", push_batches=True, # small batch size because doing GROBID processing batch_size=20, @@ -444,6 +448,11 @@ def main(): sub_persist_crossref.add_argument( "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port" ) + sub_persist_crossref.add_argument( + "--parse-refs", + action="store_true", + help="use GROBID to parse any unstructured references (default is to not)", + ) sub_persist_crossref.set_defaults(func=run_persist_crossref) args = parser.parse_args() -- cgit v1.2.3