aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-09-27 17:54:48 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-09-27 17:54:48 +0200
commit 66e694b2575de3b71cd2fba455710d9eb6c4b722 (patch)
tree 70ab6be7843833f7cdf7754f21d572dcf0f5b288
parent b86e4545487763be9542f446ab346cd1dad565cc (diff)
download refcat-66e694b2575de3b71cd2fba455710d9eb6c4b722.tar.gz
refcat-66e694b2575de3b71cd2fba455710d9eb6c4b722.zip
python: cleanup code
-rw-r--r-- python/Makefile 7
-rw-r--r-- python/refcat/attic.py 1184
-rw-r--r-- python/refcat/base.py 2
-rw-r--r-- python/refcat/cli.py 29
-rw-r--r-- python/refcat/report.py (renamed from python/refcat/techreport.py) 5
-rw-r--r-- python/refcat/utils.py 31
6 files changed, 36 insertions, 1222 deletions
diff --git a/python/Makefile b/python/Makefile
index 6821aed..d686297 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -38,13 +38,14 @@ else
rsync -avP $^ ${DEPLOY_TARGET}
endif
-.PHONY: test
-test:
- pytest -v tests
+# .PHONY: test
+# test:
+# pytest -v tests
.PHONY: fmt
fmt:
yapf -p -i -r $(PKGNAME) tests
+ isort $(PKGNAME)
.PHONY: clean
clean:
diff --git a/python/refcat/attic.py b/python/refcat/attic.py
deleted file mode 100644
index 7633bab..0000000
--- a/python/refcat/attic.py
+++ /dev/null
@@ -1,1184 +0,0 @@
-#
-# dBBBBBb dBBBBBBP dBBBBBBP dBP dBBBP
-# BB
-# dBP BB dBP dBP dBP dBP
-# dBP BB dBP dBP dBP dBP
-# dBBBBBBB dBP dBP dBP dBBBBP
-
-
-class URLList(Refcat):
- """
- TSV URL extracted, 44368911.
- """
- def requires(self):
- return URLTabs()
-
- def run(self):
- stats = collections.Counter()
- with self.input().open("rb") as f:
- with self.output().open("w") as output:
- for i, line in enumerate(f, start=1):
- parts = line.decode("utf-8").strip().split("\t")
- if len(parts) != 3:
- stats["no-url"] += 1
- continue
- urls = extract_urls(parts[2])
- stats["found-{}".format(len(urls))] += 1
- for link in urls:
- link = link + "\n"
- output.write(link.encode("utf-8"))
- self.logger.debug(json.dumps(dict(stats)))
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class RefsDOI(Refcat):
- """
- TSV with (ident, doi, full doc).
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- """
- Note: we want the full JSON document, so we use jq tostring, which
- escapes "too much", hence we need to clean up with sed, unfortunately.
- """
- # XXX: skate-doi could be an awk function, too.
- # XXX: jq tostring might escape too much
- output = shellout(r"""
- zstdcat -T0 {input} |
- LC_ALL=C tr -d '\t' |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.biblio.doi != null) | [.release_ident, .biblio.doi, (.|tostring)] | @tsv'" |
- LC_ALL=C sed 's/\\\\/\\/g' |
- LC_ALL=C awk -F $'\t' -v OFS='\t' '$2=tolower($2)' |
- skate-to-doi -B -S -f 2 |
- LC_ALL=C sort -S 30% --parallel 6 -T {tmpdir} -k2,2 |
- zstd -c -T0 > {output}
- """,
- tmpdir=self.tmpdir,
- n=self.n,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class RefsPMID(Refcat):
- """
- List of PMID, 74M refs seem to have one.
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.biblio.pmid != null and .biblio.doi == null) | [.release_ident, .biblio.pmid, (.|tostring)] | @tsv'" |
- LC_ALL=C sed 's/\\\\/\\/g' |
- LC_ALL=C sort -S 30% -T {tmpdir} -k2,2 |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- n=self.n,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class RefsPMCID(Refcat):
- """
- List of PMCID.
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.biblio.pmcid != null and .biblio.doi == null) | [.release_ident, .biblio.pmcid, (.|tostring)] | @tsv'" |
- LC_ALL=C sed 's/\\\\/\\/g' |
- LC_ALL=C sed -e 's@PMC@@g' |
- LC_ALL=C sort -S 30% -T {tmpdir} -k2,2 |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- n=self.n,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class RefsArxiv(Refcat):
- """
- List of arxiv ids from refs.
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.biblio.arxiv_id != null and .biblio.doi == null) | [.release_ident, .biblio.arxiv_id, (.|tostring)] | @tsv'" |
- LC_ALL=C sed 's/\\\\/\\/g' |
- LC_ALL=C sort -S 30% -k2,2 -T {tmpdir} |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- n=self.n,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class RefsTitles(Refcat):
- """
- Extract titles.
-
- Contains many artifacts, e.g.: ! Accurate! and! efficient! insertional!
- RNA!editing!in!isolated!Physarum!mitochondria.!RNA*
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.biblio.title != null and .biblio.doi == null) |
- [.release_ident, (.biblio.title | ltrimstr(\" \") | rtrimstr(\" \") | gsub(\"\\n\"; \" \"))] | @tsv'" |
- zstd -c -T0 > {output}
- """,
- input=self.input().path,
- n=self.n)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class RefsTitlesLower(Refcat):
- """
- Unique lowercase titles; 223m46.443s.
- """
- def requires(self):
- return RefsTitles()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- tr '[:upper:]' '[:lower:]' |
- LC_ALL=C sort -k2 |
- zstd -T0 -c > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class FatcatDOI(Refcat):
- """
- List of DOIs, lowercase on the fly.
- """
- def requires(self):
- return ReleaseExportReduced()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.ext_ids.doi != null) | [.ident, .ext_ids.doi, (.|tostring)] | @tsv'" |
- LC_ALL=C sed 's/\\\\/\\/g' |
- LC_ALL=C awk -F $'\t' -v OFS='\t' '$2=tolower($2)' |
- LC_ALL=C sort -S 25% --parallel 6 -k2,2 -T {tmpdir} |
- zstd -c -T0 > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path,
- n=self.n)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class FatcatPMID(Refcat):
- """
- List of PMID.
- """
- def requires(self):
- return ReleaseExportReduced()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.ext_ids.pmid != null) | [.ident, .ext_ids.pmid, (.|tostring)] | @tsv'" |
- LC_ALL=C sed 's/\\\\/\\/g' |
- LC_ALL=C sort -S 30% -T {tmpdir} -k2,2 |
- zstd -c -T0 > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path,
- n=self.n)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class FatcatPMCID(Refcat):
- """
- List of PMCID.
- """
- def requires(self):
- return ReleaseExportReduced()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.ext_ids.pmcid != null) | [.ident, .ext_ids.pmcid, (.|tostring)] | @tsv'" |
- LC_ALL=C sed 's/\\\\/\\/g' |
- LC_ALL=C sed -e 's@PMC@@g' |
- LC_ALL=C sort -S 30% -T {tmpdir} -k2,2 |
- zstd -c -T0 > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path,
- n=self.n)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class FatcatArxiv(Refcat):
- """
- List of arxiv ids.
- """
- def requires(self):
- return ReleaseExportReduced()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.extra.arxiv.base_id != null) | [.ident, .extra.arxiv.base_id, (.|tostring)] | @tsv'" |
- LC_ALL=C sed 's/\\\\/\\/g' |
- LC_ALL=C sort -S 30% -k2,2 -T {tmpdir} |
- zstd -c -T0 > {output}""",
- tmpdir=self.tmpdir,
- input=self.input().path,
- n=self.n)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class FatcatTitles(Refcat):
- """
- Get a list of non-normalized, sorted titles; ~104min.
- """
- def requires(self):
- return ReleaseExportReduced()
-
- def run(self):
- output = shellout(r"""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.title != null and .biblio.doi == null) |
- [.ident, (.title | ltrimstr(\" \") | rtrimstr(\" \") | gsub(\"\\n\"; \" \"))] | @tsv'" |
- zstd -c -T0 > {output}
- """,
- input=self.input().path,
- n=self.n)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class FatcatTitlesLower(Refcat):
- """
- Lowercase titles.
- """
- def requires(self):
- return FatcatTitles()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- tr '[:upper:]' '[:lower:]' |
- LC_ALL=C sort -k2 |
- zstd -T0 -c > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class FatcatSortedKeys(Refcat):
- """
- Derive key and sort; key derivation (150M docs) took 39min; total 61min.
- """
- def requires(self):
- return ReleaseExportReduced()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-derive-key -b 50000 -verbose -f tsand |
- LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class CommonDOI(Refcat):
- """
- DOI that appear in the catalog and in the refs.
- """
- def requires(self):
- return {
- "fatcat": FatcatDOI(),
- "refs": RefsDOI(),
- }
-
- def run(self):
- f1 = shellout("zstdcat -T0 {fatcat} | cut -f2 > {output}", fatcat=self.input().get("fatcat").path)
- f2 = shellout("zstdcat -T0 {refs} | cut -f2 > {output}", refs=self.input().get("refs").path)
- output = shellout(""" LC_ALL=C comm {f1} {f2} | zstd -c > {output}""", f1=f1, f2=f2)
- luigi.LocalTarget(output).move(self.output().path)
- os.remove(f1)
- os.remove(f2)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class CommonTitles(Refcat):
- def requires(self):
- return {
- "fatcat": FatcatTitles(),
- "refs": RefsTitles(),
- }
-
- def run(self):
- f1 = shellout("zstdcat -T0 {fatcat} | cut -f2 > {output}", fatcat=self.input().get("fatcat"))
- f2 = shellout("zstdcat -T0 {refs} | cut -f2 > {output}", refs=self.input().get("refs"))
- output = shellout(""" LC_ALL=C comm -12 {f1} {f2} | zstd -c > {output}""", f1=f1, f2=f2)
- luigi.LocalTarget(output).move(self.output().path)
- os.remove(f1)
- os.remove(f2)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class CommonTitlesLower(Refcat):
- def requires(self):
- return {
- "fatcat": FatcatTitlesLower(),
- "refs": RefsTitlesLower(),
- }
-
- def run(self):
- f1 = shellout("zstdcat -T0 {fatcat} | cut -f2 > {output}", fatcat=self.input().get("fatcat").path)
- f2 = shellout("zstdcat -T0 {refs} | cut -f2 > {output}", refs=self.input().get("refs").path)
- output = shellout(""" comm -12 {f1} {f2} | zstd -c > {output}""", f1=f1, f2=f2)
- luigi.LocalTarget(output).move(self.output().path)
- os.remove(f1)
- os.remove(f2)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class RefsFatcatDOIJoin(Refcat):
- """
- Join fatcat and refs DOI lists.
-
- Output will be like:
-
- ---- DOI -------------- ------ Fatcat ----------- -------- Refs -------------
-
- 10.1001/2012.jama.10158 m7eoa3hbivcq5kgzzlepbifbna paygwq34z5hsnm5ypnwp2kz6wq
- 10.1001/2012.jama.10159 xsw5qtrv3jg7pjoj67e3kijtwq 4ug6jvnedbau3nnkhuqegepw2q
- 10.1001/2012.jama.10161 7m7yv5xkkjakxh3wuncqoctphe yllvkrxtgnhnfcyxwbj3swhegu
- 10.1001/2012.jama.10368 dw2djv2qdzecncwmh4o7esg4ie ghgshdzpbranbcwsr4xsh3yfhy
-
- To count the number of citations per DOI, count occurences on the second
- column.
-
- """
- def requires(self):
- return {
- "fatcat": FatcatDOI(),
- "refs": RefsDOI(),
- }
-
- def run(self):
- output = shellout("""
- LC_ALL=C join -1 2 -2 2 <(zstdcat -T0 {fatcat}) <(zstdcat -T0 {refs}) |
- zstd -T0 -c > {output}
- """,
- fatcat=self.input().get("fatcat").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"))
-
-
-class RefsFatcatPMIDJoin(Refcat):
- """
- Join fatcat and refs PMID lists.
- """
- def requires(self):
- return {
- "fatcat": FatcatPMID(),
- "refs": RefsPMID(),
- }
-
- def run(self):
- output = shellout("""
- LC_ALL=C join -1 2 -2 2 <(zstdcat -T0 {fatcat}) <(zstdcat -T0 {refs}) |
- zstd -c -T0 > {output}
- """,
- fatcat=self.input().get("fatcat").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"))
-
-
-class RefsFatcatPMCIDJoin(Refcat):
- """
- Join fatcat and refs PMCID lists.
- """
- def requires(self):
- return {
- "fatcat": FatcatPMCID(),
- "refs": RefsPMCID(),
- }
-
- def run(self):
- output = shellout("""
- LC_ALL=C join -1 2 -2 2 <(zstdcat -T0 {fatcat}) <(zstdcat -T0 {refs}) |
- zstd -c -T0 > {output}
- """,
- fatcat=self.input().get("fatcat").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"))
-
-
-class RefsFatcatArxivJoin(Refcat):
- """
- Join fatcat, refs on arxiv (base) id.
- """
- def requires(self):
- return {
- "fatcat": FatcatArxiv(),
- "refs": RefsArxiv(),
- }
-
- def run(self):
- # TODO: We want a zippy join here (e.g. to generate biblioref docs).
- output = shellout("""
- LC_ALL=C join -1 2 -2 2 <(zstdcat -T0 {fatcat}) <(zstdcat -T0 {refs}) |
- zstd -c -T0 > {output}
- """,
- fatcat=self.input().get("fatcat").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"))
-
-
-class RefsFatcatTitleLowerJoin(Refcat):
- """
- Join fatcat and refs titles.
-
- Output will be textfile (title, fatcat ident, refs ident). XXX: need to
- filter out too common titles first.
- """
- def requires(self):
- return {
- "fatcat": FatcatTitlesLower(),
- "refs": RefsTitlesLower(),
- }
-
- def run(self):
- output = shellout("""
- LC_ALL=C join -1 2 -2 2 {fatcat} {refs} > {output}
- """,
- fatcat=self.input().get("fatcat").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv"))
-
-
-class RefsFatcatGroupJoin(Refcat):
- """
- Concat joins.
-
- 10.1001/2012.jama.11274 of7donzkmrbiddbyrr4guqbzum nncja4imynb4rajadrlbnoklxy
- 10.1001/2012.jama.11274 of7donzkmrbiddbyrr4guqbzum noimcv5xdzd6hfqu2mebcrzr34
- 10.1001/2012.jama.11274 of7donzkmrbiddbyrr4guqbzum nqzg5lgdxvbhniy2hajlqd3aqi
- ...
- """
- def requires(self):
- return [RefsFatcatDOIJoin(), RefsFatcatPMIDJoin(), RefsFatcatArxivJoin(), RefsFatcatPMCIDJoin()]
-
- def run(self):
- _, tmpf = tempfile.mkstemp(prefix="refcat-")
- for target in self.input():
- shellout("""cat {file} >> {output}""", file=target.path, output=tmpf)
- luigi.LocalTarget(tmpf).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"))
-
-
-class RefsFatcatRanked(Refcat):
- """
- Inbound count, ident; 32m34.142s.
-
- 15175 ui64apmob5gnrfwe7pwgk7egju
- 15167 cejzj3ddszcdrmij7np36am5fa
- 15165 2b2ok43pirduva7ai3745k5xa4
- 15158 cn4c33ctb5g5fax3touxjdmfle
- 15155 rrlbmbro4rhwri3zawz3uhp5va
- 15138 o62kjogy4zdyrlvy7cu7rlcs3m
- """
- def requires(self):
- return RefsFatcatGroupJoin()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {file} |
- LC_ALL=C sort -k2,3 -u |
- LC_ALL=C cut -d ' ' -f 2 |
- LC_ALL=C uniq -c |
- LC_ALL=C sort -nr > {output}
- """,
- file=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv"))
-
-
-#
-#
-# # TODO: merge refs docs and release docs, maybe add an source label, then
-# # cluster; run verify and report on the number of similar records; generate a list of common titles
-# #
-# # TODO: find non-matched items and check for any pattern
-#
-#
-class RefsCounter(Refcat):
- """
- Key counts, see: ref_counter.py.
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- counts = collections.Counter()
- with self.input().open("r") as f:
- for i, line in enumerate(f):
- obj = json.loads(line)
- counts['total'] += 1
- for k in obj.keys():
- if k == 'biblio':
- continue
- elif k == 'ref_source':
- counts["source_" + obj[k]] += 1
- elif obj.get(k):
- counts["has_" + k] += 1
- biblio = obj.get('biblio')
- if not biblio:
- continue
- for k in biblio.keys():
- if biblio.get(k):
- counts["has_" + k] += 1
- if biblio.get('doi') or biblio.get('pmcid') or biblio.get('pmid') or biblio.get('arxiv_id'):
- counts['has_any_extid'] += 1
- if biblio.get('container_name') and biblio.get('volume') and biblio.get('issue') and biblio.get(
- 'pages'):
- counts['has_container_volume_issue_pages'] += 1
- if biblio.get('title') and biblio.get('contrib_raw_names') and biblio.get('year'):
- counts['has_title_contrib_year'] += 1
- if biblio.get('container_name') and biblio.get('contrib_raw_names') and biblio.get('year'):
- counts['has_contrib_container_year'] += 1
- if biblio.get('title') and biblio.get('container_name') and biblio.get('year'):
- counts['has_title_container_year'] += 1
-
- if i % 1000000 == 0:
- print(json.dumps(counts, indent=4, sort_keys=True), file=sys.stderr)
-
- with self.output().open("w") as output:
- json.dump(counts, output)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json"))
-
-
-class RefsKeyStats(Refcat):
- """
- How many titles, DOI, etc. do we have in refs?
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- stats = {
- "total": 0,
- "no_biblio": 0,
- "stats": collections.Counter(),
- }
- with self.input().open("r") as f:
- for i, line in enumerate(f):
- stats["total"] += 1
- doc = json.loads(line)
- if "biblio" not in doc:
- stats["no_biblio"] += 1
- continue
- biblio = doc["biblio"]
- key = "|".join(sorted(biblio.keys()))
- stats["stats"][key] += 1
- if i % 1000000 == 0:
- print(json.dumps(stats, indent=4, sort_keys=True), file=sys.stderr)
-
- with self.output().open("w") as output:
- json.dumps(stats, output)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json"))
-
-
-class RefsToRelease(Refcat):
- """
- Convert a refs doc into a minimalistic release entity. Requires "skate"
- tools - XXX: polish.
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-conv -f ref -w 24 -b 100000 |
- zstd -T0 -c > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class RefsSortedKeys(Refcat):
- """
- Derive key and sort; 1.8B json docs, took: 255min; 122k/s; key extration
- almost 3h (might be faster with rust); 90G compressed.
-
- Keys based on title will have many empty keys; e.g. "2021-02-20",
- 838,057,412 docs have no key.
- """
- def requires(self):
- return RefsToRelease()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-derive-key -skip-empty-keys -b 50000 -verbose -f tsand |
- LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class RefsReleasesMerged(Refcat):
- """
- Merge release and refs (in release form).
-
- wc: 1579687186 53137849922 913692185284
- """
- def requires(self):
- return {
- "release": ReleaseExportReduced(),
- "refs": RefsToRelease(),
- }
-
- def run(self):
- _, f = tempfile.mkstemp(prefix="refcat-")
- for k, v in self.input().items():
- shellout("cat {input} >> {output}", input=v.path, output=f)
- luigi.LocalTarget(f).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class RefsTitleFrequency(Refcat):
- """
- Dig into common titles.
- """
- tmpdir = luigi.Parameter(default="/fast/tmp", description="set tempdir", significant=False)
-
- def requires(self):
- return RefsTitlesLower()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- LC_ALL=C cut -f2 |
- LC_ALL=C sort -T {tmpdir} -S20% --compress-program pzstd --parallel 6 |
- LC_ALL=C uniq -c |
- LC_ALL=C sort -nr |
- zstd -c9 > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-# # XXX: After RefsReleasesMerged, we want to cluster.
-# # python -m fuzzycat cluster -t tsandcrawler < data/re.json > cluster.json.zst
-# #
-# # Note: First run with no compression filled the disk, add zstd to fuzzycat.
-
-
-class RefsFatcatSortedKeys(Refcat):
- """
- Extract keys and sort.
- """
- def requires(self):
- return RefsReleasesMerged()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-derive-key -skip-empty-keys -b 50000 -verbose -f tsand |
- LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class RefsFatcatClusters(Refcat):
- """
- Group by clusters. Full set will be ~90GB compressed, about 40M clusters
- (already filtered, so 2+ docs only, with at least on ref and one release, etc).
- """
- def requires(self):
- return RefsFatcatSortedKeys()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-cluster -both |
- zstd -T0 -c9 > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-# ==== new style zippy biblioref generation
-
-
-class BiblioRefFromFuzzyClusters(Refcat):
- """
- Use "bref" mode to generate a biblioref document from verified clusters.
- """
- def requires(self):
- return RefsFatcatClusters()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-verify -m bref > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class BiblioRefZippyDOI(Refcat):
- """
- Generate proposed biblioref docs from two sorted key files, sorted by DOI.
- """
- def requires(self):
- return {
- "refs": RefsDOI(),
- "releases": FatcatDOI(),
- }
-
- def run(self):
- output = shellout(r"""
- skate-verify -m exact -r doi -R <(zstdcat -T0 {releases}) -F <(zstdcat -T0 {refs}) |
- zstd -c -T0 > {output}
- """,
- releases=self.input().get("releases").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class BiblioRefZippyArxiv(Refcat):
- """
- Generate proposed biblioref docs from two sorted key files, sorted by DOI.
- """
- def requires(self):
- return {
- "refs": RefsArxiv(),
- "releases": FatcatArxiv(),
- }
-
- def run(self):
- output = shellout(r"""
- skate-verify -m exact -r arxiv -R <(zstdcat -T0 {releases}) -F <(zstdcat -T0 {refs}) |
- zstd -c -T0 > {output}
- """,
- releases=self.input().get("releases").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class BiblioRefZippyPMID(Refcat):
- """
- Generate proposed biblioref docs from two sorted key files, sorted by DOI.
- """
- def requires(self):
- return {
- "refs": RefsPMID(),
- "releases": FatcatPMID(),
- }
-
- def run(self):
- output = shellout(r"""
- skate-verify -m exact -r pmid -R <(zstdcat -T0 {releases}) -F <(zstdcat -T0 {refs}) |
- zstd -c -T0 > {output}
- """,
- releases=self.input().get("releases").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class BiblioRefZippyPMCID(Refcat):
- """
- Generate proposed biblioref docs from two sorted key files, sorted by DOI.
- """
- def requires(self):
- return {
- "refs": RefsPMCID(),
- "releases": FatcatPMCID(),
- }
-
- def run(self):
- output = shellout(r"""
- skate-verify -m exact -r pmcid -R <(zstdcat -T0 {releases}) -F <(zstdcat -T0 {refs}) |
- zstd -c -T0 > {output}
- """,
- releases=self.input().get("releases").path,
- refs=self.input().get("refs").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class BiblioRefV2(Refcat):
- """
- A v1 set of biblioref schema docs.
- """
- def requires(self):
- return [
- BiblioRefZippyDOI(),
- BiblioRefZippyArxiv(),
- BiblioRefZippyPMID(),
- BiblioRefZippyPMCID(),
- BiblioRefFromFuzzyClusters()
- ]
-
- def run(self):
- _, tmpf = tempfile.mkstemp(prefix="refcat-")
- for target in self.input():
- shellout("""
- zstdcat -T0 {input} |
- skate-bref-id |
- zstd -T0 >> {output}
- """,
- input=target.path,
- output=tmpf)
- luigi.LocalTarget(tmpf).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-# ==== V3 related
-
-# ==== RG title match example
-
-
-class RGSitemapToRelease(Refcat):
- """
- Turn sitemap data to skeleton release.
- """
- def run(self):
- link = "https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst"
- output = shellout("""
- curl -sL {link} |
- zstdcat -T0 |
- parallel --block 10M -j 16 --pipe "jq -rc '{{\"title\": .title, \"extra\": {{\"rg\": {{\"sitemap\": true}}}}}}'" |
- zstd -T0 -c > {output}
- """,
- link=link)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class RGSitemapFatcatMerged(Refcat):
- """
- A minimal combined fatcat and RG dataset.
- """
- def requires(self):
- return [RGSitemapToRelease(), ReleaseExportTitleOnly()]
-
- def run(self):
- _, tmpf = tempfile.mkstemp(prefix="refcat-")
- for target in self.input():
- shellout("""cat {file} >> {output}""", file=target.path, output=tmpf)
- luigi.LocalTarget(tmpf).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class RGSitemapFatcatSortedKeys(Refcat):
- """
- Extract keys and sort.
- """
- def requires(self):
- return RGSitemapFatcatMerged()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-derive-key -b 50000 -verbose -f tsand |
- LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} |
- zstd -T0 -c > {output}""",
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-# ==== MAG
-
-
-class MAGDOI(Refcat):
- """
- List of MAG DOI.
- """
- def requires(self):
- return MAGPapers()
-
- def run(self):
- output = shellout("""
- unpigz -c {input} |
- cut -f3 |
- grep -v ^$ |
- zstd -T0 -c > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-# ==== WikipediaCitations
-
-
-class BiblioRefWikiDOISortedKeys(Refcat):
- """
- Sorted DOI keys from wikipedia.
- """
- def requires(self):
- return WikipediaCitationsMinimalDataset()
-
- def run(self):
- output = shellout("""
- cat {input} |
- skate-wikipedia-doi |
- LC_ALL=C sort -S 10% -k2,2 |
- zstd -T0 -c > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class BiblioRefWiki(Refcat):
- def requires(self):
- return {
- "wiki": BiblioRefWikiDOISortedKeys(),
- "releases": FatcatDOI(),
- }
-
- def run(self):
- output = shellout(r"""
- skate-verify -m wiki -r doi -R <(zstdcat -T0 {releases}) -W <(zstdcat -T0 {wiki}) |
- zstd -c -T0 > {output}
- """,
- releases=self.input().get("releases").path,
- wiki=self.input().get("wiki").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-# ==== Prepare unmatched
-
-
-class BiblioRefSortedIdent(Refcat):
- def requires(self):
- return BiblioRefV2()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-derive-key -b 50000 -verbose -F source_release_ident |
- LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class RefsSortedIdent(Refcat):
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-derive-key -b 50000 -verbose -F release_ident |
- LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-# OL
-
-
-class WithISBN(Refcat):
- """
- Keeps converted refs with isbn.
- """
- def requires(self):
- return RefsToRelease()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.ext_ids.isbn != null)'" |
- zstd -T0 -c > {output}
- """,
- n=self.n,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class OpenLibraryWorks(Refcat):
- """
- Extract just the works.
- """
- def requires(self):
- return OpenLibraryDump()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- parallel -j {n} --block 10M --pipe "jq -rc 'select(.type == \\"work\\")'" |
- zstd -T0 -c > {output}
- """,
- n=self.n,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
diff --git a/python/refcat/base.py b/python/refcat/base.py
index b36cae5..13e2324 100644
--- a/python/refcat/base.py
+++ b/python/refcat/base.py
@@ -4,13 +4,13 @@ Various utilities, copied from https://pypi.org/project/gluish/.
import datetime
import hashlib
+import logging
import os
import random
import re
import string
import subprocess
import tempfile
-import logging
import luigi
diff --git a/python/refcat/cli.py b/python/refcat/cli.py
index a490553..6bb3073 100644
--- a/python/refcat/cli.py
+++ b/python/refcat/cli.py
@@ -18,6 +18,7 @@ To install completion run:
$ source <(refcat.pyz completion)
"""
+import io
import logging
import os
import subprocess
@@ -32,9 +33,8 @@ from luigi.task_register import TaskClassNotFoundException
from refcat import __version__
from refcat.deps import dump_deps, dump_deps_dot
+from refcat.report import *
from refcat.settings import LOGGING_CONF_FILE, settings
from refcat.tasks import *
-from refcat.techreport import *
-from refcat.utils import columnize
# These are utility classes of luigi.
@@ -58,6 +58,44 @@ suppress_task_names = [
]
+def columnize(lines, term_width=80, indent=0, pad=2):
+    """
+    Format a list of strings into left-aligned, column-major columns.
+
+    Items run down the first column, then the second, and so on; every cell
+    is padded to the width of the longest item, so rows carry trailing
+    whitespace. As many columns are used as fit into `term_width`, after
+    accounting for `indent` leading spaces and `pad` spaces between columns.
+
+    Returns the rendered text, one newline-terminated row per line, or an
+    empty string if `lines` is empty.
+    """
+    n_lines = len(lines)
+    if n_lines == 0:
+        # Return a str rather than None, so callers can always print it.
+        return ""
+
+    col_width = max(len(line) for line in lines)
+    n_cols = (term_width + pad - indent) // max(1, col_width + pad)
+    n_cols = min(n_lines, max(1, n_cols))
+
+    # Rows needed for n_cols columns (ceiling division); then shrink n_cols
+    # to the columns actually filled, so that no column is empty and only
+    # the last one may be shorter than col_len.
+    col_len = -(-n_lines // n_cols)
+    n_cols = -(-n_lines // col_len)
+
+    cols = [lines[i * col_len:(i + 1) * col_len] for i in range(n_cols)]
+
+    sio = io.StringIO()
+    for r in range(col_len):
+        # The last column may not reach the bottom rows; skip it there.
+        row = [col[r] for col in cols if r < len(col)]
+        sio.write(" " * indent + (" " * pad).join(line.ljust(col_width) for line in row) + "\n")
+
+    return sio.getvalue()
+
+
def effective_task_names():
"""
Runnable, relevant task names.
diff --git a/python/refcat/techreport.py b/python/refcat/report.py
index 4fe5771..8060aa3 100644
--- a/python/refcat/techreport.py
+++ b/python/refcat/report.py
@@ -2,8 +2,9 @@
Tasks for techreport.
"""
import luigi
-from refcat.tasks import Refcat, OpenCitations, BrefWithDOI
-from refcat.base import shellout, Zstd
+
+from refcat.base import Zstd, shellout
+from refcat.tasks import BrefWithDOI, OpenCitations, Refcat
class COCIDOIOnly(Refcat):
diff --git a/python/refcat/utils.py b/python/refcat/utils.py
deleted file mode 100644
index 30f3593..0000000
--- a/python/refcat/utils.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""
-Assorted utilities.
-"""
-
-import io
-
-
-def columnize(lines, term_width=80, indent=0, pad=2):
- n_lines = len(lines)
- if n_lines == 0:
- return
-
- col_width = max(len(line) for line in lines)
- n_cols = int((term_width + pad - indent) / (col_width + pad))
- n_cols = min(n_lines, max(1, n_cols))
-
- col_len = int(n_lines / n_cols) + (0 if n_lines % n_cols == 0 else 1)
- if (n_cols - 1) * col_len >= n_lines:
- n_cols -= 1
-
- cols = [lines[i * col_len:i * col_len + col_len] for i in range(n_cols)]
-
- rows = list(zip(*cols))
- rows_missed = zip(*[col[len(rows):] for col in cols[:-1]])
- rows.extend(rows_missed)
-
- sio = io.StringIO()
- for row in rows:
- sio.write(" " * indent + (" " * pad).join(line.ljust(col_width) for line in row) + "\n")
-
- return sio.getvalue()