about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-11-01 17:54:42 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-11-04 17:19:52 -0700
commit da87108eecfd94e02d949a4fe4fc7998a489b934 (patch)
tree 75b93602d3ec49615234ade1fb6d60abe0c21020
parent 59af5ddd0a9587eaf53b4f6965c0d6290295ce55 (diff)
download sandcrawler-da87108eecfd94e02d949a4fe4fc7998a489b934.tar.gz
sandcrawler-da87108eecfd94e02d949a4fe4fc7998a489b934.zip
crossref persist: make GROBID ref parsing an option (not default)
-rwxr-xr-x python/persist_tool.py 6
-rw-r--r-- python/sandcrawler/persist.py 23
-rwxr-xr-x python/sandcrawler_worker.py 13
3 files changed, 33 insertions, 9 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index a4f9812..5cf5776 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -126,6 +126,7 @@ def run_crossref(args):
worker = PersistCrossrefWorker(
db_url=args.db_url,
grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
)
pusher = JsonLinePusher(
worker,
@@ -267,6 +268,11 @@ def main():
sub_crossref.add_argument(
"--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
)
+ sub_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 4c9d9d7..13b1232 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -676,7 +676,13 @@ class PersistHtmlTeiXmlWorker(GenericPersistDocWorker):
class PersistCrossrefWorker(SandcrawlerWorker):
- def __init__(self, db_url: str, grobid_client: Optional[GrobidClient], **kwargs):
+ def __init__(
+ self,
+ db_url: str,
+ grobid_client: Optional[GrobidClient],
+ parse_refs: bool = True,
+ **kwargs
+ ):
super().__init__(**kwargs)
self.db = SandcrawlerPostgresClient(db_url)
self.cur = self.db.conn.cursor()
@@ -684,6 +690,7 @@ class PersistCrossrefWorker(SandcrawlerWorker):
self.grobid_client = grobid_client
else:
self.grobid_client = GrobidClient()
+ self.parse_refs = parse_refs
def process(self, record: Any, key: Optional[str] = None) -> Any:
"""Only do batches (as transactions)"""
@@ -702,7 +709,8 @@ class PersistCrossrefWorker(SandcrawlerWorker):
record=record,
)
)
- refs_batch.append(self.grobid_client.crossref_refs(record))
+ if self.parse_refs:
+ refs_batch.append(self.grobid_client.crossref_refs(record))
resp = self.db.insert_crossref(self.cur, crossref_batch)
if len(crossref_batch) < len(batch):
@@ -710,11 +718,12 @@ class PersistCrossrefWorker(SandcrawlerWorker):
self.counts["insert-crossref"] += resp[0]
self.counts["update-crossref"] += resp[1]
- resp = self.db.insert_grobid_refs(self.cur, refs_batch)
- if len(refs_batch) < len(batch):
- self.counts["skip"] += len(batch) - len(refs_batch)
- self.counts["insert-grobid_refs"] += resp[0]
- self.counts["update-grobid_refs"] += resp[1]
+ if refs_batch:
+ resp = self.db.insert_grobid_refs(self.cur, refs_batch)
+ if len(refs_batch) < len(batch):
+ self.counts["skip"] += len(batch) - len(refs_batch)
+ self.counts["insert-grobid_refs"] += resp[0]
+ self.counts["update-grobid_refs"] += resp[1]
self.db.commit()
return []
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 73bd444..c7eca86 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -298,12 +298,16 @@ def run_persist_ingest_file(args):
def run_persist_crossref(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
consume_topic = "fatcat-{}.api-crossref".format(args.env)
- worker = PersistCrossrefWorker(db_url=args.db_url, grobid_client=grobid_client)
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ parse_refs=args.parse_refs,
+ )
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-ingest",
+ group="persist-crossref",
push_batches=True,
# small batch size because doing GROBID processing
batch_size=20,
@@ -444,6 +448,11 @@ def main():
sub_persist_crossref.add_argument(
"--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
)
+ sub_persist_crossref.add_argument(
+ "--parse-refs",
+ action="store_true",
+ help="use GROBID to parse any unstructured references (default is to not)",
+ )
sub_persist_crossref.set_defaults(func=run_persist_crossref)
args = parser.parse_args()