aboutsummaryrefslogtreecommitdiffstats
path: root/python/refcat/tasks.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/refcat/tasks.py')
-rw-r--r--python/refcat/tasks.py5
1 files changed, 5 insertions, 0 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index ebb5873..bbce44c 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -179,7 +179,9 @@ import os
import sys
import tempfile
+import grobid_tei_xml
import luigi
+import requests
from refcat.base import BaseTask, Zstd, shellout
from refcat.settings import settings
@@ -1530,8 +1532,10 @@ class BrefZipWikiDOI(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
# Grobid reparse via grobid_tei_xml
+
class UnmatchedRefsReparse(Refcat):
"""
Reparse unmatched refs which have an unstructured field; about 190M/270M
@@ -1568,6 +1572,7 @@ class UnmatchedRefsReparse(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
# Wayback related, extract URL, query CDX.
#
# TODO: Make CDX lookup more generic, maybe a separate library or tool or mass