class WikipediaCitations20211201(luigi.ExternalTask, Refcat):
    """
    Updated wikipedia citations dataset (external input, not generated here),
    see: https://archive.org/details/wikipedia-citations-enwiki-20211201

    Example record (pretty-printed here):

    {
      "revision_id": 991003499,
      "refs": [
        {
          "Authors": [
            {
              "first": "Liévin",
              "last": "Ndayizeye"
            },
            {
              "first": "Benoît",
              "last": "Nzigidahera"
            },
            {
              "first": "Abdelaziz Elamin",
              "last": "Gesmallah"
            }
          ],
          "CitationClass": "journal",
          "Date": "2019-03-27",
          "ID_list": {
            "DOI": "10.1007/s42690-019-00013-w",
            "ISSN": "1742-7592"
          },
          "Issue": "2",
          "Pages": "125-130",
          "Periodical": "International Journal of Tropical Insect Science",
          "PublisherName": "Springer Science and Business Media LLC",
          "Title": "Current distribution of Bactrocera latifrons Hendel in the different agro-ecological zones of Burundi",
          "Volume": "39"
        }
      ],
      "site_name": "enwiki",
      "page_title": "List of Bactrocera species"
    }
    """
    def output(self):
        # The file lives below the directory configured as
        # settings.WIKIPEDIA_CITATIONS_20211201.
        filename = "enwiki-20211201-pages-articles.citations.json"
        location = os.path.join(settings.WIKIPEDIA_CITATIONS_20211201, filename)
        return luigi.LocalTarget(path=location)
class Wikipedia20211201DOI(Refcat):
    """
    Extract DOI from the updated wikipedia citations dataset
    (wikipedia-citations-enwiki-20211201); one output row per reference that
    carries a DOI.

    The input is read line by line, one JSON document per line. References
    are listed under "refs", identifiers of a reference under "ID_list",
    e.g. (shortened):

        {"revision_id": 991003499,
         "refs": [{"Title": "Current distribution of Bactrocera ...",
                   "ID_list": {"DOI": "10.1007/s42690-019-00013-w",
                               "ISSN": "1742-7592"}}],
         "site_name": "enwiki",
         "page_title": "List of Bactrocera species"}

    The output is an uncompressed TSV with three columns: DOI, page title and
    the page document as JSON, where "refs" is emptied and two keys are
    added: "index" (position of the reference within "refs") and "Title"
    (title of the reference, null if absent).
    """
    def requires(self):
        return WikipediaCitations20211201()

    def run(self):
        with self.output().open("w") as output:
            with self.input().open() as handle:
                for line in handle:
                    for row in self._rows(json.loads(line)):
                        output.write(row)

    @staticmethod
    def _rows(doc):
        """
        Yield one newline-terminated TSV row for each reference in `doc` that
        has a DOI. The given document is left unmodified.
        """
        for i, ref in enumerate(doc.get("refs") or []):
            ids = ref.get("ID_list")
            # Skip references without an identifier object; anything but a
            # dict cannot be looked up by key.
            if not isinstance(ids, dict):
                continue
            doi = ids.get("DOI")
            # A null or empty DOI cannot serve as the key column.
            if not doi:
                continue
            # Build a fresh dict rather than aliasing `doc`: "refs" keeps its
            # position, "index" and "Title" are appended (or overwritten in
            # place, should the document already carry such keys).
            reduced = dict(doc, refs=[], index=i, Title=ref.get("Title"))
            fields = (doi, doc["page_title"], json.dumps(reduced))
            yield "\t".join(fields) + "\n"

    def output(self):
        # Plain (uncompressed) TSV.
        return luigi.LocalTarget(path=self.path(ext="tsv"))