diff options
-rw-r--r-- | python/refcat/tasks.py | 75 |
1 files changed, 62 insertions, 13 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 894e25a..4b70b9f 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -240,16 +240,6 @@ class ReleaseExportExpanded(luigi.ExternalTask, Refcat): return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd) -class MAGPapers(luigi.ExternalTask, Refcat): - """ - Microsoft Academic dump as archived, e.g. - https://archive.org/details/mag-2020-06-25 - we want this mainly for - comparisons. - """ - def output(self): - return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd) - - class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): """ From https://archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia @@ -302,6 +292,40 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat): return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd) +class MAGPapers(luigi.ExternalTask, Refcat): + """ + Microsoft Academic dump as archived, e.g. + https://archive.org/details/mag-2020-06-25 - we want this mainly for + comparisons. + """ + def output(self): + return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd) + + +class OpenCitations(luigi.ExternalTask, Refcat): + """ + OpenCitations distributes a zip file containing zip files containing files + with doi-doi lines. + + We prepare the raw file to have a single zstd compressed file to work with. 
+ + Raw data looks like: + + oci,citing,cited,creation,timespan,journal_sc,author_sc + 02003080406360106010101060909370200010237070005020502-02001000106361937231430122422370200000837000737000200,10.3846/16111699.2012.705252,10.1016/j.neucom.2008.07.020,2012-10-04,P3Y0M,no,no + 02003080406360106010101060909370200010237070005020502-0200308040636010601016301060909370200000837093701080963010908,10.3846/16111699.2012.705252,10.3846/1611-1699.2008.9.189-198,2012-10-04,P4Y0M4D,yes,no + 02003080406360106010101060909370200010237070005020502-02001000106361937102818141224370200000737000237000003,10.3846/16111699.2012.705252,10.1016/j.asieco.2007.02.003,2012-10-04,P5Y6M,no,no + 02003080406360106010101060909370200010237070005020502-02003080406360106010101060909370200010137050505030808,10.3846/16111699.2012.705252,10.3846/16111699.2011.555388,2012-10-04,P1Y5M22D,yes,no + ... + + Combine, e.g. via: + + $ find . -name "*.csv" -exec cat {} + | grep -v '^oci,' | zstd -c -T0 > coci.csv.zst + """ + def output(self): + return luigi.LocalTarget(path=settings.COCI, format=Zstd) + + # ----8< Derivations # @@ -1140,10 +1164,12 @@ class Bref(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) -# # Final Assembly # -------------- # +# Currently, "BrefCombined" is the result of the "Bref" matches and the raw +# refs. The joined dataset should be directly indexable into elasticsearch in +# fatcat_refs schema. class BrefSortedByWorkID(Refcat): @@ -1238,10 +1264,26 @@ class BrefCombined(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) +# Stats from BrefCombined +# ======================= +# +# Calculate stats off the match result and other comparisons. 
+# +# TODO: # +# [ ] match status and reason freq table +# * [ ] [A] minimal source-target ident set (plus status, reason), sort by source ident +# * [ ] [B] fatcat db source ident plus ext id sorted by source ident +# * [ ] [C] turn [A] and [B] into a DOI to DOI match table (sorted by source doi) +# * [ ] [D] sort COCI by citing (or cited) +# * [ ] [E] compare COCI and "ASC" doi matches (as set ops, only COCI, only "ASC", etc.) + # Extra # ----- # +# Tinkering with suffix arrays to pluck out journal names from abbreviations, etc. +# +# TODO: Be more principled, some stats on how many refs we could match this way. class UnmatchedRefsToRelease(Refcat): @@ -1309,7 +1351,11 @@ class UnmatchedResolveJournalNamesMapped(Refcat): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) -# Wikipedia related tasks +# Wikipedia related tasks; pages referencing papers we know about, e.g. +# Wiki_page -> target_release_ident. +# +# Using prepared datasets, just using DOI for the moment. +# TODO: use more than just DOI. class WikipediaDOI(Refcat): @@ -1358,7 +1404,10 @@ class BrefZipWikiDOI(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) -# Wayback related +# Wayback related, extract URL, query CDX. +# +# TODO: Make CDX lookup more generic, maybe a separate library or tool or mass +# query utility via hadoop streaming or the like. class RefsURL(Refcat): |