aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-10-28 14:46:58 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-10-28 14:47:01 +0200
commitd2f14aa814f051e748f2702b48f43d6356e03a94 (patch)
treede01339885a4ac8d5721957181125821cb66165c
parent1d0ba58bb0252b60893913bced24eb3369546a5f (diff)
downloadrefcat-d2f14aa814f051e748f2702b48f43d6356e03a94.tar.gz
refcat-d2f14aa814f051e748f2702b48f43d6356e03a94.zip
tasks: reparse task
-rw-r--r--python/refcat/tasks.py37
1 files changed, 37 insertions, 0 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 591acbd..ebb5873 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1530,6 +1530,43 @@ class BrefZipWikiDOI(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+# Grobid reparse via grobid_tei_xml
+
+class UnmatchedRefsReparse(Refcat):
+    """
+    Reparse unmatched refs which have an unstructured field; about 190M/270M
+    unmatched, currently. We have more unmatched - these are only the ones
+    where we do not have a title.
+
+    Every unstructured string of at least five characters is posted to the
+    GROBID processCitation endpoint. The first citation parsed from the
+    response is written to the output as one JSON document per line.
+    Records without a usable unstructured value, and responses that yield
+    no citation, are skipped. Request failures are not caught: a non-2xx
+    response (raise_for_status) or a timeout aborts the task.
+    """
+
+    def requires(self):
+        return UnmatchedRefs()
+
+    def run(self):
+        # A single session lets the HTTP client reuse connections instead of
+        # setting up a new one for each of the many input lines.
+        with requests.Session() as session:
+            with self.output().open("w") as output, self.input().open() as f:
+                for line in f:
+                    doc = json.loads(line)
+                    # A missing key, null or empty value carries nothing to parse.
+                    unstructured = doc.get("unstructured")
+                    if not unstructured or len(unstructured) < 5:
+                        continue
+                    grobid_resp = session.post(
+                        "https://grobid.qa.fatcat.wiki/api/processCitation",
+                        data={
+                            'citations': unstructured,
+                            'consolidateCitations': 0,
+                        },
+                        timeout=10.0,
+                    )
+                    grobid_resp.raise_for_status()
+                    citations = grobid_tei_xml.parse_citations_xml(grobid_resp.text)
+                    if not citations:
+                        continue
+                    # Only the first parsed citation is kept per input record.
+                    output.write(json.dumps(citations[0].to_dict()) + "\n")
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
# Wayback related, extract URL, query CDX.
#