aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2022-01-11 10:46:04 +0100
committerMartin Czygan <martin.czygan@gmail.com>2022-01-11 10:46:04 +0100
commita07ad381d1bc98d803c951a5088de70f6039393d (patch)
treee310bf43a7bca59475d2e8ead39573f51ef09b38 /python
parent5f9d05ff29816a330963196cdc9b54c1e6b1306d (diff)
downloadrefcat-a07ad381d1bc98d803c951a5088de70f6039393d.tar.gz
refcat-a07ad381d1bc98d803c951a5088de70f6039393d.zip
tasks: wikipedia update
Diffstat (limited to 'python')
-rw-r--r--python/refcat/tasks.py89
1 files changed, 86 insertions, 3 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index b4d200d..3489cc1 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -281,11 +281,56 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
"ID_list": "{PMID=15944443, DOI=10.1093/molbev/msi185}"
}
-
+ An updated version: wikipedia-citations-enwiki-20211201, with better ID extraction.
"""
def output(self):
return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
+class WikipediaCitations20211201(luigi.ExternalTask, Refcat):
+ """
+ Updated Wikipedia citations dataset: https://archive.org/details/wikipedia-citations-enwiki-20211201
+
+ Example line:
+
+ {
+ "revision_id": 991003499,
+ "refs": [
+ {
+ "Authors": [
+ {
+ "first": "Liévin",
+ "last": "Ndayizeye"
+ },
+ {
+ "first": "Benoît",
+ "last": "Nzigidahera"
+ },
+ {
+ "first": "Abdelaziz Elamin",
+ "last": "Gesmallah"
+ }
+ ],
+ "CitationClass": "journal",
+ "Date": "2019-03-27",
+ "ID_list": {
+ "DOI": "10.1007/s42690-019-00013-w",
+ "ISSN": "1742-7592"
+ },
+ "Issue": "2",
+ "Pages": "125-130",
+ "Periodical": "International Journal of Tropical Insect Science",
+ "PublisherName": "Springer Science and Business Media LLC",
+ "Title": "Current distribution of Bactrocera latifrons Hendel in the different agro-ecological zones of Burundi",
+ "Volume": "39"
+ }
+ ],
+ "site_name": "enwiki",
+ "page_title": "List of Bactrocera species"
+ }
+ """
+ def output(self):
+ return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS_20211201,
+ "enwiki-20211201-pages-articles.citations.json"))
class OpenLibraryEditions(luigi.ExternalTask, Refcat):
"""
@@ -1517,6 +1562,37 @@ class WikipediaDOI(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+class Wikipedia20211201DOI(Refcat):
+ """
+ Updated wikipedia citations dataset.
+
+ Input: one JSON document per line with a "refs" list; each ref may carry
+ an "ID_list" object, e.g. {"DOI": "10.1007/s42690-019-00013-w"}.
+ Output: one TSV line (DOI, page title, reduced JSON doc) per ref with a DOI.
+ """
+ def requires(self):
+ return WikipediaCitations20211201()
+
+ def run(self):
+ with self.output().open("w") as output:
+ with self.input().open() as handle:
+ for line in handle:
+ doc = json.loads(line)
+ for i, ref in enumerate(doc.get("refs", [])):
+ if not "ID_list" in ref:
+ continue
+ if not "DOI" in ref["ID_list"]:
+ continue
+ doi = ref["ID_list"]["DOI"]
+ reduced = doc
+ reduced["refs"] = []
+ reduced["index"] = i
+ reduced["Title"] = ref.get("Title")
+ fields = [doi, doc["page_title"], json.dumps(reduced)]
+ output.write("\t".join(fields) + "\n")
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="tsv"))
class BrefZipWikiDOI(Refcat):
"""
@@ -1526,17 +1602,24 @@ class BrefZipWikiDOI(Refcat):
"""
def requires(self):
return {
- "wiki": WikipediaDOI(),
+ # "wiki": WikipediaDOI(),
+ "wiki": Wikipedia20211201DOI(),
"fatcat": FatcatDOI(),
}
def run(self):
output = shellout("""
- skate-reduce -m wiki -W <(zstdcat -T0 {wiki}) -L <(zstdcat -T0 {fatcat}) |
+ skate-reduce -m wiki -W {wiki} -L <(zstdcat -T0 {fatcat}) |
zstd -T0 -c > {output}
""",
wiki=self.input().get("wiki").path,
fatcat=self.input().get("fatcat").path)
+ # output = shellout("""
+ # skate-reduce -m wiki -W <(zstdcat -T0 {wiki}) -L <(zstdcat -T0 {fatcat}) |
+ # zstd -T0 -c > {output}
+ # """,
+ # wiki=self.input().get("wiki").path,
+ # fatcat=self.input().get("fatcat").path)
luigi.LocalTarget(output).move(self.output().path)
def output(self):