aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-08-02 14:22:57 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-08-02 14:22:57 +0200
commitc930b3f6db615b163a9b907b3855a0652522c545 (patch)
tree4548f570f3b64b1906fe68199f3df882eaeb24ad
parente532c94fb3fe1c7f0b428eccdecce9d689a47264 (diff)
downloadrefcat-c930b3f6db615b163a9b907b3855a0652522c545.tar.gz
refcat-c930b3f6db615b163a9b907b3855a0652522c545.zip
update notes and docs
-rw-r--r--python/refcat/tasks.py75
1 files changed, 62 insertions, 13 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 894e25a..4b70b9f 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -240,16 +240,6 @@ class ReleaseExportExpanded(luigi.ExternalTask, Refcat):
return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd)
-class MAGPapers(luigi.ExternalTask, Refcat):
- """
- Microsoft Academic dump as archived, e.g.
- https://archive.org/details/mag-2020-06-25 - we want this mainly for
- comparisons.
- """
- def output(self):
- return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
-
-
class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
"""
From https://archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia
@@ -302,6 +292,40 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd)
+class MAGPapers(luigi.ExternalTask, Refcat):
+ """
+ Microsoft Academic dump as archived, e.g.
+ https://archive.org/details/mag-2020-06-25 - we want this mainly for
+ comparisons.
+ """
+ def output(self):
+ return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
+
+
+class OpenCitations(luigi.ExternalTask, Refcat):
+ """
+ OpenCitations distributes a zip file containing zip files containing files
+ with doi-doi lines.
+
+ We prepare the raw file to have a single zstd compressed file to work with.
+
+ Raw data looks like:
+
+ oci,citing,cited,creation,timespan,journal_sc,author_sc
+ 02003080406360106010101060909370200010237070005020502-02001000106361937231430122422370200000837000737000200,10.3846/16111699.2012.705252,10.1016/j.neucom.2008.07.020,2012-10-04,P3Y0M,no,no
+ 02003080406360106010101060909370200010237070005020502-0200308040636010601016301060909370200000837093701080963010908,10.3846/16111699.2012.705252,10.3846/1611-1699.2008.9.189-198,2012-10-04,P4Y0M4D,yes,no
+ 02003080406360106010101060909370200010237070005020502-02001000106361937102818141224370200000737000237000003,10.3846/16111699.2012.705252,10.1016/j.asieco.2007.02.003,2012-10-04,P5Y6M,no,no
+ 02003080406360106010101060909370200010237070005020502-02003080406360106010101060909370200010137050505030808,10.3846/16111699.2012.705252,10.3846/16111699.2011.555388,2012-10-04,P1Y5M22D,yes,no
+ ...
+
+ Combine, e.g. via:
+
+ $ find . -name "*.csv" -exec cat {} + | grep -v '^oci,' | zstd -c -T0 > coci.csv.zst
+ """
+ def output(self):
+ return luigi.LocalTarget(path=settings.COCI, format=Zstd)
+
+
# ----8< Derivations
#
@@ -1140,10 +1164,12 @@ class Bref(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-#
# Final Assembly
# --------------
#
+# Currently, "BrefCombined" is the result of the "Bref" matches and the raw
+# refs. The joined dataset should be directly indexable into elasticsearch in
+# fatcat_refs schema.
class BrefSortedByWorkID(Refcat):
@@ -1238,10 +1264,26 @@ class BrefCombined(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+# Stats from BrefCombined
+# =======================
+#
+# Calculate stats off the match result and other comparisons.
+#
+# TODO:
#
+# [ ] match status and reason freq table
+# * [ ] [A] minimal source-target ident set (plus status, reason), sort by source ident
+# * [ ] [B] fatcat db source ident plus ext id sorted by source ident
+# * [ ] [C] turn [A] and [B] into a DOI to DOI match table (sorted by source doi)
+# * [ ] [D] sort COCI by citing (or cited)
+# * [ ] [E] compare COCI and "ASC" doi matches (as set ops, only COCI, only "ASC", etc.)
+
# Extra
# -----
#
+# Tinkering with suffix arrays to pluck out journal names from abbreviations, etc.
+#
+# TODO: Be more principled, some stats on how many refs we could match this way.
class UnmatchedRefsToRelease(Refcat):
@@ -1309,7 +1351,11 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-# Wikipedia related tasks
+# Wikipedia related tasks; pages referencing papers we know about, e.g.
+# Wiki_page -> target_release_ident.
+#
+# Using prepared datasets, just using DOI for the moment.
+# TODO: use more than just DOI.
class WikipediaDOI(Refcat):
@@ -1358,7 +1404,10 @@ class BrefZipWikiDOI(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-# Wayback related
+# Wayback related, extract URL, query CDX.
+#
+# TODO: Make CDX lookup more generic, maybe a separate library or tool or mass
+# query utility via hadoop streaming or the like.
class RefsURL(Refcat):