about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/README.md 5
-rw-r--r-- python/refcat/tasks.py 14
2 files changed, 17 insertions, 2 deletions
diff --git a/python/README.md b/python/README.md
index f449477..071286c 100644
--- a/python/README.md
+++ b/python/README.md
@@ -93,3 +93,8 @@ OpenLibraryWorks
## Dependencies
![](notes/deps.png)
+
+
+## TODO
+
+* [ ] wrap up refcat
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 071184e..d774038 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -290,6 +290,12 @@ class WikipediaCitations20211201(luigi.ExternalTask, Refcat):
"""
Update wikipedia citations dataset: https://archive.org/details/wikipedia-citations-enwiki-20211201
+ Generated with https://github.com/dissemin/wikiciteparser.
+
+ * DOI, PMID, PMCID, arxiv-id, webarchive (prefix version)
+ * other identifier exact
+ * does not have some title cases
+
Example line:
{
@@ -1574,21 +1580,25 @@ class Wikipedia20211201DOI(Refcat):
return WikipediaCitations20211201()
def run(self):
- with tempfile.NamedTemporaryFile(delete=False) as tf:
+ with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf:
with self.input().open() as handle:
for line in handle:
doc = json.loads(line)
+ if not doc["page_title"]:
+ continue
for i, ref in enumerate(doc.get("refs", [])):
if not "ID_list" in ref:
continue
if not "DOI" in ref["ID_list"]:
continue
- doi = ref["ID_list"]["DOI"]
+ doi = ref["ID_list"]["DOI"].strip()
reduced = doc
reduced["refs"] = []
reduced["index"] = i
reduced["Title"] = ref.get("Title")
fields = [doi, doc["page_title"], json.dumps(reduced)]
+ if not all(fields):
+ continue
tf.write("\t".join(fields) + "\n")
output = shellout("LC_ALL=C sort -S30% {input} > {output}", input=tf.name)
luigi.LocalTarget(output).move(self.output().path)