diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-03-30 01:35:58 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-03-30 01:35:58 +0200 |
commit | e8ed1ff2a60b694b242669a50c5a37346f3b6d79 (patch) | |
tree | 28eeed2962722ff9c63cb7db80cf559b68fd3b97 /python | |
parent | 956476225837ad9ccf30c9698806e3fd959b75ef (diff) | |
download | refcat-e8ed1ff2a60b694b242669a50c5a37346f3b6d79.tar.gz refcat-e8ed1ff2a60b694b242669a50c5a37346f3b6d79.zip |
stub wikipedia converter
Diffstat (limited to 'python')
-rw-r--r-- | python/notes/version_3.md | 6 | ||||
-rw-r--r-- | python/refcat/tasks.py | 24 |
2 files changed, 30 insertions, 0 deletions
diff --git a/python/notes/version_3.md b/python/notes/version_3.md index 4f165f0..e4794a6 100644 --- a/python/notes/version_3.md +++ b/python/notes/version_3.md @@ -199,3 +199,9 @@ $ time zstdcat -T0 /magna/refcat/UnmatchedRefs/date-2021-02-20.json.zst | LC_ALL 260768384 ``` +---- + +# Wikipedia + +* /magna/data/wikipedia_citations_2020-07-14 + diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 4851f2b..df56b9d 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -177,6 +177,20 @@ class MAGPapers(luigi.ExternalTask, Refcat): def output(self): return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd) +class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): + """ + From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia + Citations: A comprehensive dataset of citations with identifiers extracted + from English Wikipedia). + + Dataset contains parquet, but we want JSON here: + + $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json + """ + def output(self): + return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json")) + + # ----8< Derivations class RefsWithUnstructured(Refcat): @@ -1412,3 +1426,13 @@ class MAGDOI(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + +# ==== WikipediaCitations + +class BiblioRefWikipediaCitations(Refcat): + """ + Generate a biblioref schema from wikipedia citations minimal file. + """ + + def requires(self): + return WikipediaCitationsMinimalDataset() |