diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-10-28 14:46:58 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-10-28 14:47:01 +0200 |
commit | d2f14aa814f051e748f2702b48f43d6356e03a94 (patch) | |
tree | de01339885a4ac8d5721957181125821cb66165c | |
parent | 1d0ba58bb0252b60893913bced24eb3369546a5f (diff) | |
download | refcat-d2f14aa814f051e748f2702b48f43d6356e03a94.tar.gz refcat-d2f14aa814f051e748f2702b48f43d6356e03a94.zip |
tasks: reparse task
-rw-r--r-- | python/refcat/tasks.py | 37 |
1 files changed, 37 insertions, 0 deletions
# Grobid reparse via grobid_tei_xml


class UnmatchedRefsReparse(Refcat):
    """
    Reparse unmatched refs which have an unstructured field; about 190M/270M
    unmatched, currently. We have more unmatched - these are only the ones
    where we do not have a title.

    Every input line is decoded as a JSON document. Documents with a usable
    "unstructured" string are posted to the GROBID ``processCitation``
    endpoint; the first citation parsed from the response (if any) is written
    to the output as one JSON document per line.
    """

    # GROBID citation parsing endpoint and per-request timeout in seconds.
    GROBID_URL = "https://grobid.qa.fatcat.wiki/api/processCitation"
    GROBID_TIMEOUT = 10.0
    # Raw strings shorter than this are skipped without querying GROBID.
    MIN_UNSTRUCTURED_LENGTH = 5

    def requires(self):
        return UnmatchedRefs()

    def run(self):
        """
        Stream the input, reparse each raw citation and write the results.

        Raises whatever ``requests`` raises on a timeout or a non-2xx
        response (via ``raise_for_status``); such an error aborts the run.
        """
        # A single session lets requests reuse the underlying connection
        # instead of setting up a new one for every reference.
        with requests.Session() as session:
            with self.output().open("w") as output:
                with self.input().open() as f:
                    for line in f:
                        parsed = self._reparse_line(session, line)
                        if parsed is None:
                            continue
                        output.write(json.dumps(parsed) + "\n")

    def _reparse_line(self, session, line):
        """
        Return the first GROBID-parsed citation for one input line as a dict,
        or None if the line has no usable "unstructured" value or GROBID
        returned no citation.
        """
        doc = json.loads(line)
        unstructured = doc.get("unstructured")
        # Skip a missing key as before; also skip null or non-string values,
        # which would otherwise fail in len() below.
        if not isinstance(unstructured, str):
            return None
        if len(unstructured) < self.MIN_UNSTRUCTURED_LENGTH:
            return None
        grobid_resp = session.post(
            self.GROBID_URL,
            data={
                'citations': unstructured,
                'consolidateCitations': 0,
            },
            timeout=self.GROBID_TIMEOUT,
        )
        grobid_resp.raise_for_status()
        citations = grobid_tei_xml.parse_citations_xml(grobid_resp.text)
        if not citations:
            return None
        return citations[0].to_dict()

    def output(self):
        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)