diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/tasks.py | 37 |
1 files changed, 37 insertions, 0 deletions
# Grobid reparse via grobid_tei_xml


class UnmatchedRefsReparse(Refcat):
    """
    Reparse unmatched refs which have an unstructured field; about 190M/270M
    unmatched, currently. We have more unmatched - these are only the ones
    where we do not have a title.

    Each input line is a JSON document. Its "unstructured" string is sent to
    the GROBID ``processCitation`` endpoint, the returned TEI XML is parsed
    with ``grobid_tei_xml`` and the first parsed citation is written out as
    one JSON document per line.

    Documents are skipped (nothing is written) when "unstructured" is
    missing, is not a string, is shorter than five characters, or when GROBID
    yields no citation. An HTTP error status or a request timeout is not
    caught and aborts the task.
    """

    def requires(self):
        """Depend on the unmatched refs task, read line by line in `run`."""
        return UnmatchedRefs()

    def run(self):
        """Stream input docs through GROBID and write the parsed citations."""
        grobid_url = "https://grobid.qa.fatcat.wiki/api/processCitation"
        # One session for the whole run, so connections to the GROBID host
        # are reused instead of being re-established for every input line.
        with requests.Session() as session:
            with self.output().open("w") as output:
                with self.input().open() as f:
                    for line in f:
                        doc = json.loads(line)
                        unstructured = doc.get("unstructured")
                        # A JSON null (or any non-string value) has nothing
                        # to parse; skip it instead of failing on len().
                        if not isinstance(unstructured, str):
                            continue
                        if len(unstructured) < 5:
                            continue
                        grobid_resp = session.post(
                            grobid_url,
                            data={
                                'citations': unstructured,
                                'consolidateCitations': 0,
                            },
                            timeout=10.0,
                        )
                        grobid_resp.raise_for_status()
                        citations = grobid_tei_xml.parse_citations_xml(grobid_resp.text)
                        if not citations:
                            continue
                        # Only the first parsed citation is kept.
                        output.write(json.dumps(citations[0].to_dict()) + "\n")

    def output(self):
        """Return the local ``json.zst`` target, written via the Zstd format."""
        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)