about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/notes/version_3.md | 6
-rw-r--r-- python/refcat/tasks.py | 24
2 files changed, 30 insertions, 0 deletions
diff --git a/python/notes/version_3.md b/python/notes/version_3.md
index 4f165f0..e4794a6 100644
--- a/python/notes/version_3.md
+++ b/python/notes/version_3.md
@@ -199,3 +199,9 @@ $ time zstdcat -T0 /magna/refcat/UnmatchedRefs/date-2021-02-20.json.zst | LC_ALL
260768384
```
+----
+
+# Wikipedia
+
+* /magna/data/wikipedia_citations_2020-07-14
+
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 4851f2b..df56b9d 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -177,6 +177,20 @@ class MAGPapers(luigi.ExternalTask, Refcat):
def output(self):
return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
+class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
+ """
+ From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia
+ Citations: A comprehensive dataset of citations with identifiers extracted
+ from English Wikipedia).
+
+ Dataset contains parquet, but we want JSON here:
+
+ $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
+ """
+ def output(self):
+ return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
+
+
# ----8< Derivations
class RefsWithUnstructured(Refcat):
@@ -1412,3 +1426,13 @@ class MAGDOI(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+# ==== WikipediaCitations
+
+class BiblioRefWikipediaCitations(Refcat):
+ """
+ Generate a biblioref schema from wikipedia citations minimal file.
+ """
+
+ def requires(self):
+ return WikipediaCitationsMinimalDataset()