about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/conf/settings.ini | 5
-rw-r--r-- python/refcat/tasks.py | 29
2 files changed, 34 insertions, 0 deletions
diff --git a/python/conf/settings.ini b/python/conf/settings.ini
index 4996374..e79153c 100644
--- a/python/conf/settings.ini
+++ b/python/conf/settings.ini
@@ -4,6 +4,9 @@
BASE = "/bigger/.cache"
TMPDIR = "/bigger/tmp"
+# Raw input file locations
+# ------------------------
+#
# The raw input containing a single reference per line and sha1 of compressed
# file.
#
@@ -34,3 +37,5 @@ REFS_FILE = "/bigger/scholar/fatcat_scholar_work_fulltext.refs.json.zst"
# Release docs from database export.
RELEASE_EXPORT_EXPANDED_FILE = "/bigger/citations/release_export_expanded.json.zst"
+# Directory holding the Microsoft Academic Graph (MAG) dump files, e.g. Papers.txt.gz.
+MAG = "/magna/data/mag-2020-06-25"
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c6faece..4851f2b 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -170,6 +170,13 @@ class ReleaseExportExpanded(luigi.ExternalTask, Refcat):
return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd)
+class MAGPapers(luigi.ExternalTask, Refcat):
+ """
+ Microsoft Academic dump as archived, e.g. https://archive.org/details/mag-2020-06-25
+ """
+ def output(self):
+ return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
+
# ----8< Derivations
class RefsWithUnstructured(Refcat):
@@ -1383,3 +1390,25 @@ class RGSitemapFatcatSortedKeys(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+# ==== MAG
+
+class MAGDOI(Refcat):
+ """
+ List of MAG DOI, one value per line, zstd-compressed.
+
+ Takes the third tab-separated column of the MAGPapers file (presumably
+ the DOI column -- confirm against the MAG schema) and drops empty lines.
+ Values are written as found; nothing here deduplicates or normalizes them.
+ """
+ def requires(self):
+ return MAGPapers()
+
+ def run(self):
+ # Pipeline: gunzip the input, keep field 3, drop empty lines, compress
+ # with zstd. NOTE(review): shellout looks like it fills in {output} with
+ # a temporary path and returns it -- confirm; the result is then moved
+ # to the final location.
+ output = shellout("""
+ unpigz -c {input} |
+ cut -f3 |
+ grep -v ^$ |
+ zstd -T0 -c > {output}
+ """,
+ input=self.input().path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)