-- dump_affiliations.sql
--
-- Export raw affiliation strings for analysis.
--
-- Writes one tab-separated line per matching contributor row:
--
--     <release ident id> TAB <raw_affiliation>
--
-- Only contributor rows attached to the current revision of a live,
-- non-redirected release ident are included, and only when the contributor
-- row actually carries a raw affiliation string.
--
-- Provenance: extra/sql_dumps/dump_affiliations.sql, added in commit
-- 2e2bbc2ab9642675abdec6696b8862e67593323a (Bryan Newbold,
-- Thu, 3 Oct 2019 15:31:14 -0700), recovered from a cgit v1.2.3 patch export.
--
-- NOTE: COPY ... TO '<file>' writes the file on the database *server* host,
-- using the server process's filesystem permissions.

-- Read-only snapshot for the whole dump. Per the PostgreSQL documentation,
-- SERIALIZABLE READ ONLY DEFERRABLE may wait at start-up for a safe snapshot
-- and then runs without risk of serialization failure.
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

COPY (
    SELECT
        ident.id,
        contrib.raw_affiliation
    FROM release_ident AS ident
    INNER JOIN release_contrib AS contrib
        ON contrib.release_rev = ident.rev_id
    WHERE contrib.raw_affiliation IS NOT NULL
        AND ident.redirect_id IS NULL
        AND ident.is_live = 't'
)
TO '/tmp/fatcat_affiliations.tsv'
-- Text format (the default): tab-delimited, with tabs, newlines and
-- backslashes inside values backslash-escaped. NULLs are written as an empty
-- field instead of the default \N.
WITH (FORMAT text, NULL '');

-- Nothing was modified, so there is nothing to commit; just end the
-- transaction.
ROLLBACK;

-- Post processing (shell), run from the directory holding the dump file
-- (the COPY above writes it under /tmp):
--
--    cut -f2 fatcat_affiliations.tsv | sort -S 4G | uniq -c | sort -nr | gzip > fatcat_affiliations.counts.txt.gz
--    gzip fatcat_affiliations.tsv
--
-- The first command counts how often each distinct affiliation string occurs
-- (most frequent first) and stores the compressed counts; the second
-- compresses the raw dump itself.