tasks: reparse task

author: Martin Czygan <martin.czygan@gmail.com> 2021-10-28 14:46:58 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-10-28 14:47:01 +0200
commit: d2f14aa814f051e748f2702b48f43d6356e03a94 (patch)
tree: de01339885a4ac8d5721957181125821cb66165c
parent: 1d0ba58bb0252b60893913bced24eb3369546a5f (diff)
download: refcat-d2f14aa814f051e748f2702b48f43d6356e03a94.tar.gz
refcat-d2f14aa814f051e748f2702b48f43d6356e03a94.zip
1 files changed, 37 insertions, 0 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 591acbd..ebb5873 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1530,6 +1530,43 @@ class BrefZipWikiDOI(Refcat):
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
 
+# Grobid reparse via grobid_tei_xml
+
+class UnmatchedRefsReparse(Refcat):
+    """
+    Reparse unmatched refs which have an unstructured field; about 190M/270M
+    unmatched, currently. We have more unmatched - these are only the ones
+    where we do not have a title. Every unstructured value of length five or
+    more is posted to the GROBID processCitation endpoint; the first parsed
+    citation, if any, is written out as one JSON line. A timeout or an HTTP
+    error status raises and propagates out of run().
+    """
+    def requires(self):
+        return UnmatchedRefs()
+
+    def run(self):
+        url = "https://grobid.qa.fatcat.wiki/api/processCitation"
+        # A single session pools connections instead of reconnecting per ref.
+        with requests.Session() as session, \
+                self.output().open("w") as output, self.input().open() as f:
+            for line in f:
+                doc = json.loads(line)
+                if "unstructured" not in doc:
+                    continue
+                unstructured = doc["unstructured"]
+                # Also skips an explicit null, which len() would choke on.
+                if not unstructured or len(unstructured) < 5:
+                    continue
+                data = {'citations': unstructured, 'consolidateCitations': 0}
+                grobid_resp = session.post(url, data=data, timeout=10.0)
+                grobid_resp.raise_for_status()
+                citations = grobid_tei_xml.parse_citations_xml(grobid_resp.text)
+                if not citations:
+                    continue
+                output.write(json.dumps(citations[0].to_dict()) + "\n")
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
 
 # Wayback related, extract URL, query CDX.
 #
author	Martin Czygan <martin.czygan@gmail.com>	2021-10-28 14:46:58 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-10-28 14:47:01 +0200
commit	d2f14aa814f051e748f2702b48f43d6356e03a94 (patch)
tree	de01339885a4ac8d5721957181125821cb66165c
parent	1d0ba58bb0252b60893913bced24eb3369546a5f (diff)
download	refcat-d2f14aa814f051e748f2702b48f43d6356e03a94.tar.gz refcat-d2f14aa814f051e748f2702b48f43d6356e03a94.zip