diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/report.py | 3 | ||||
-rw-r--r-- | python/refcat/tasks.py | 5 | ||||
-rw-r--r-- | python/setup.py | 2 |
3 files changed, 8 insertions, 2 deletions
diff --git a/python/refcat/report.py b/python/refcat/report.py index 4a0c219..aaa4544 100644 --- a/python/refcat/report.py +++ b/python/refcat/report.py @@ -55,6 +55,7 @@ class BrefDOIOnly(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + # TODO: DOAJ subset # # (1) find all release idents with doaj id @@ -108,5 +109,3 @@ class BrefDOIOnly(Refcat): # $ zstdcat -T0 doi_refs.tsv.zst| pv -l | wc -l # 1.32G 0:06:33 [3.34M/s] [ <=> ]1315040677 # 1315040677 - - diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index ebb5873..bbce44c 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -179,7 +179,9 @@ import os import sys import tempfile +import grobid_tei_xml import luigi +import requests from refcat.base import BaseTask, Zstd, shellout from refcat.settings import settings @@ -1530,8 +1532,10 @@ class BrefZipWikiDOI(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) + # Grobid reparse via grobid_tei_xml + class UnmatchedRefsReparse(Refcat): """ Reparse unmatched refs which have an unstructured field; about 190M/270M @@ -1568,6 +1572,7 @@ class UnmatchedRefsReparse(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) + # Wayback related, extract URL, query CDX. # # TODO: Make CDX lookup more generic, maybe a separate library or tool or mass diff --git a/python/setup.py b/python/setup.py index 587e269..6b3d57a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -26,7 +26,9 @@ with open("README.md", "r") as fh: ]}, install_requires=[ "dynaconf[ini]", + "grobid_xml_tei", "luigi", + "requests", ], extras_require={"dev": [ "ipython", |