From 7cc9e8685361117de5766cbd99c8dd56f0c28cc6 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Fri, 14 Jan 2022 00:46:23 +0100
Subject: tasks: fix Wikipedia20211201DOI

---
 python/README.md       |  5 +++++
 python/refcat/tasks.py | 14 ++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/README.md b/python/README.md
index f449477..071286c 100644
--- a/python/README.md
+++ b/python/README.md
@@ -93,3 +93,8 @@ OpenLibraryWorks
 ## Dependencies
 
 ![](notes/deps.png)
+
+
+## TODO
+
+* [ ] wrap up refcat
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 071184e..d774038 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -290,6 +290,12 @@ class WikipediaCitations20211201(luigi.ExternalTask, Refcat):
     """
     Update wikipedia citations dataset: https://archive.org/details/wikipedia-citations-enwiki-20211201
 
+    Generated with https://github.com/dissemin/wikiciteparser.
+
+    * DOI, PMID, PMCID, arxiv-id, webarchive (prefix version)
+    * other identifier exact
+    * does not have some title cases
+
     Example line:
 
     {
@@ -1574,21 +1580,25 @@ class Wikipedia20211201DOI(Refcat):
         return WikipediaCitations20211201()
 
     def run(self):
-        with tempfile.NamedTemporaryFile(delete=False) as tf:
+        with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf:
             with self.input().open() as handle:
                 for line in handle:
                     doc = json.loads(line)
+                    if not doc["page_title"]:
+                        continue
                     for i, ref in enumerate(doc.get("refs", [])):
                         if not "ID_list" in ref:
                             continue
                         if not "DOI" in ref["ID_list"]:
                             continue
-                        doi = ref["ID_list"]["DOI"]
+                        doi = ref["ID_list"]["DOI"].strip()
                         reduced = doc
                         reduced["refs"] = []
                         reduced["index"] = i
                         reduced["Title"] = ref.get("Title")
                         fields = [doi, doc["page_title"], json.dumps(reduced)]
+                        if not all(fields):
+                            continue
                         tf.write("\t".join(fields) + "\n")
         output = shellout("LC_ALL=C sort -S30% {input} > {output}", input=tf.name)
         luigi.LocalTarget(output).move(self.output().path)
-- 
cgit v1.2.3