# Provenance (from the cgit v1.2.3 patch view this snippet was taken from):
#   commit:  d2f14aa814f051e748f2702b48f43d6356e03a94
#   author:  Martin Czygan
#   date:    Thu, 28 Oct 2021 14:46:58 +0200
#   subject: tasks: reparse task
#   file:    python/refcat/tasks.py (index 591acbd..ebb5873),
#            hunk @@ -1530,6 +1530,43 @@, added directly after
#            BrefZipWikiDOI.output().

# Grobid reparse via grobid_tei_xml


class UnmatchedRefsReparse(Refcat):
    """
    Reparse unmatched refs which have an unstructured field; about 190M/270M
    unmatched, currently. We have more unmatched - these are only the ones
    where we do not have a title.

    Each line of the ``UnmatchedRefs`` output is read as one JSON document.
    Its ``unstructured`` string is posted to the GROBID ``processCitation``
    endpoint, the response is parsed with
    ``grobid_tei_xml.parse_citations_xml`` and the first parsed citation is
    written as one JSON line to this task's output.

    Documents are skipped when ``unstructured`` is missing, null or shorter
    than ``MIN_UNSTRUCTURED_LENGTH`` characters, and when GROBID yields no
    citation. Any HTTP error status aborts the task, because
    ``raise_for_status()`` is called on every response.
    """

    # Plain class attributes, deliberately not luigi parameters.
    GROBID_CITATION_URL = "https://grobid.qa.fatcat.wiki/api/processCitation"
    GROBID_TIMEOUT = 10.0
    MIN_UNSTRUCTURED_LENGTH = 5

    def requires(self):
        """Depend on the unmatched refs dataset."""
        return UnmatchedRefs()

    def run(self):
        """Stream the input, reparse each reference via GROBID, write JSON lines."""
        # One session for the whole run, so connections to the GROBID host
        # are pooled and reused instead of being reopened for every record.
        with self.output().open("w") as output, \
                self.input().open() as f, \
                requests.Session() as session:
            for line in f:
                doc = json.loads(line)
                unstructured = doc.get("unstructured")
                # Missing key and JSON null are both "nothing to parse".
                if unstructured is None:
                    continue
                if len(unstructured) < self.MIN_UNSTRUCTURED_LENGTH:
                    continue
                grobid_resp = session.post(
                    self.GROBID_CITATION_URL,
                    data={
                        'citations': unstructured,
                        'consolidateCitations': 0,
                    },
                    timeout=self.GROBID_TIMEOUT,
                )
                grobid_resp.raise_for_status()
                citations = grobid_tei_xml.parse_citations_xml(grobid_resp.text)
                if not citations:
                    continue
                # Only the first parsed citation is kept.
                output.write(json.dumps(citations[0].to_dict()) + "\n")

    def output(self):
        """Return the zstd-compressed JSON lines target for this task."""
        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)


# Wayback related, extract URL, query CDX.
#