update notes and docs

author: Martin Czygan <martin.czygan@gmail.com> 2021-08-02 14:22:57 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-08-02 14:22:57 +0200
commit: c930b3f6db615b163a9b907b3855a0652522c545 (patch)
tree: 4548f570f3b64b1906fe68199f3df882eaeb24ad /python
parent: e532c94fb3fe1c7f0b428eccdecce9d689a47264 (diff)
download: refcat-c930b3f6db615b163a9b907b3855a0652522c545.tar.gz
refcat-c930b3f6db615b163a9b907b3855a0652522c545.zip
1 files changed, 62 insertions, 13 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 894e25a..4b70b9f 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -240,16 +240,6 @@ class ReleaseExportExpanded(luigi.ExternalTask, Refcat):
         return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd)
 
 
-class MAGPapers(luigi.ExternalTask, Refcat):
-    """
-    Microsoft Academic dump as archived, e.g.
-    https://archive.org/details/mag-2020-06-25 - we want this mainly for
-    comparisons.
-    """
-    def output(self):
-        return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
-
-
 class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
     """
     From https://archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia
@@ -302,6 +292,40 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
         return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd)
 
 
+class MAGPapers(luigi.ExternalTask, Refcat):
+    """
+    Microsoft Academic dump as archived, e.g.
+    https://archive.org/details/mag-2020-06-25 - we want this mainly for
+    comparisons.
+    """
+    def output(self):
+        return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
+
+
+class OpenCitations(luigi.ExternalTask, Refcat):
+    """
+    OpenCitations distributes a zip file containing zip files containing files
+    with doi-doi lines.
+
+    We prepare the raw file to have a single zstd compressed file to work with.
+
+    Raw data looks like:
+
+    oci,citing,cited,creation,timespan,journal_sc,author_sc
+    02003080406360106010101060909370200010237070005020502-02001000106361937231430122422370200000837000737000200,10.3846/16111699.2012.705252,10.1016/j.neucom.2008.07.020,2012-10-04,P3Y0M,no,no
+    02003080406360106010101060909370200010237070005020502-0200308040636010601016301060909370200000837093701080963010908,10.3846/16111699.2012.705252,10.3846/1611-1699.2008.9.189-198,2012-10-04,P4Y0M4D,yes,no
+    02003080406360106010101060909370200010237070005020502-02001000106361937102818141224370200000737000237000003,10.3846/16111699.2012.705252,10.1016/j.asieco.2007.02.003,2012-10-04,P5Y6M,no,no
+    02003080406360106010101060909370200010237070005020502-02003080406360106010101060909370200010137050505030808,10.3846/16111699.2012.705252,10.3846/16111699.2011.555388,2012-10-04,P1Y5M22D,yes,no
+    ...
+
+    Combine, e.g. via:
+
+    $ find . -name "*.csv" -exec cat {} + | grep -v '^oci,' | zstd -c -T0 > coci.csv.zst
+    """
+    def output(self):
+        return luigi.LocalTarget(path=settings.COCI, format=Zstd)
+
+
 # ----8< Derivations
 
 #
@@ -1140,10 +1164,12 @@ class Bref(Refcat):
         return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
 
 
-#
 # Final Assembly
 # --------------
 #
+# Currently, "BrefCombined" is the result of the "Bref" matches and the raw
+# refs. The joined dataset should be directly indexable into elasticsearch in
+# fatcat_refs schema.
 
 
 class BrefSortedByWorkID(Refcat):
@@ -1238,10 +1264,26 @@ class BrefCombined(Refcat):
         return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
 
 
+# Stats from BrefCombined
+# =======================
+#
+# Calculate stats off the match result and other comparisons.
+#
+# TODO:
 #
+# [ ] match status and reason freq table
+# * [ ] [A] minimal source-target ident set (plus status, reason), sort by source ident
+# * [ ] [B] fatcat db source ident plus ext id sorted by source ident
+# * [ ] [C] turn [A] and [B] into a DOI to DOI match table (sorted by source doi)
+# * [ ] [D] sort COCI by citing (or cited)
+# * [ ] [E] compare COCI and "ASC" doi matches (as set ops, only COCI, only "ASC", etc.)
+
 # Extra
 # -----
 #
+# Tinkering with suffix arrays to pluck out journal names from abbreviations, etc.
+#
+# TODO: Be more principled, some stats on how many refs we could match this way.
 
 
 class UnmatchedRefsToRelease(Refcat):
@@ -1309,7 +1351,11 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
 
 
-# Wikipedia related tasks
+# Wikipedia related tasks; pages referencing papers we know about, e.g.
+# Wiki_page -> target_release_ident.
+#
+# Using prepared datasets, just using DOI for the moment.
+# TODO: use more than just DOI.
 
 
 class WikipediaDOI(Refcat):
@@ -1358,7 +1404,10 @@ class BrefZipWikiDOI(Refcat):
         return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
 
 
-# Wayback related
+# Wayback related, extract URL, query CDX.
+#
+# TODO: Make CDX lookup more generic, maybe a separate library or tool or mass
+# query utility via hadoop streaming or the like.
 
 
 class RefsURL(Refcat):
author	Martin Czygan <martin.czygan@gmail.com>	2021-08-02 14:22:57 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-08-02 14:22:57 +0200
commit	c930b3f6db615b163a9b907b3855a0652522c545 (patch)
tree	4548f570f3b64b1906fe68199f3df882eaeb24ad /python
parent	e532c94fb3fe1c7f0b428eccdecce9d689a47264 (diff)
download	refcat-c930b3f6db615b163a9b907b3855a0652522c545.tar.gz refcat-c930b3f6db615b163a9b907b3855a0652522c545.zip