From 7c6afa0a21883dc8037f3d021246db24eef39b41 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 29 Nov 2021 15:02:27 -0800
Subject: clean up extra/ folder a bit

---
 extra/checks/.gitignore                        |   2 -
 extra/checks/check_extid.sh                    |  49 -----
 extra/checks/check_hashes.sh                   |  16 --
 extra/checks/check_issnl.sh                    |  15 --
 extra/cleanups/.gitignore                      |   2 +
 extra/cleanups/check_extid.sh                  |  49 +++++
 extra/cleanups/check_hashes.sh                 |  16 ++
 extra/cleanups/check_issnl.sh                  |  15 ++
 .../scripts/fixup_longtail_issnl_unique.py     | 232 +++++++++++++++++++++
 extra/collectd_statsd.conf                     |  22 --
 extra/deployment/collectd_statsd.conf          |  22 ++
 extra/deployment/nginx_fatcat-api              |  39 ++++
 extra/deployment/nginx_fatcat-web              |  35 ++++
 extra/deployment/systemd_fatcat-web.service    |  16 ++
 extra/deployment/uwsgi_fatcat-web.sh           |  16 ++
 extra/fixups/fixup_longtail_issnl_unique.py    | 232 ---------------------
 extra/nginx_fatcat-api                         |  39 ----
 extra/nginx_fatcat-web                         |  35 ----
 extra/systemd_fatcat-web.service               |  16 --
 extra/update_gh-pages.sh                       |  24 ---
 extra/uwsgi_fatcat-web.sh                      |  16 --
 21 files changed, 442 insertions(+), 466 deletions(-)
 delete mode 100644 extra/checks/.gitignore
 delete mode 100755 extra/checks/check_extid.sh
 delete mode 100755 extra/checks/check_hashes.sh
 delete mode 100755 extra/checks/check_issnl.sh
 create mode 100644 extra/cleanups/.gitignore
 create mode 100755 extra/cleanups/check_extid.sh
 create mode 100755 extra/cleanups/check_hashes.sh
 create mode 100755 extra/cleanups/check_issnl.sh
 create mode 100755 extra/cleanups/scripts/fixup_longtail_issnl_unique.py
 delete mode 100644 extra/collectd_statsd.conf
 create mode 100644 extra/deployment/collectd_statsd.conf
 create mode 100644 extra/deployment/nginx_fatcat-api
 create mode 100644 extra/deployment/nginx_fatcat-web
 create mode 100644 extra/deployment/systemd_fatcat-web.service
 create mode 100644 extra/deployment/uwsgi_fatcat-web.sh
 delete mode 100755 extra/fixups/fixup_longtail_issnl_unique.py
 delete mode 100644 extra/nginx_fatcat-api
 delete mode 100644 extra/nginx_fatcat-web
 delete mode 100644 extra/systemd_fatcat-web.service
 delete mode 100755 extra/update_gh-pages.sh
 delete mode 100644 extra/uwsgi_fatcat-web.sh

diff --git a/extra/checks/.gitignore b/extra/checks/.gitignore
deleted file mode 100644
index 431c3bbc..00000000
--- a/extra/checks/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.txt
-*.tsv
diff --git a/extra/checks/check_extid.sh b/extra/checks/check_extid.sh
deleted file mode 100755
index f74f50b6..00000000
--- a/extra/checks/check_extid.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-
-set -e -u -o pipefail
-
-export LC_ALL=C
-
-EXTID_FILE=$1
-
-zcat $EXTID_FILE \
-    | awk '{print $3 "\t" $1}' \
-    | rg -v '^\t' \
-    | sort -S 4G \
-    > doi_ident.tsv
-zcat $EXTID_FILE \
-    | awk '{print $4 "\t" $1}' \
-    | rg -v '^\t' \
-    | sort -S 4G \
-    > pmid_ident.tsv
-zcat $EXTID_FILE \
-    | awk '{print $5 "\t" $1}' \
-    | rg -v '^\t' \
-    | sort -S 4G \
-    > pmcid_ident.tsv
-zcat $EXTID_FILE \
-    | awk '{print $6 "\t" $1}' \
-    | rg -v '^\t' \
-    | sort -S 4G \
-    > wikidata_ident.tsv
-
-# these identifiers aren't fixed-width, so we need to join (sigh)
-cut -f1 doi_ident.tsv \
-    | uniq -d \
-    | join -t$'\t' - doi_ident.tsv \
-    > doi_ident.dupes.tsv
-cut -f1 pmid_ident.tsv \
-    | uniq -d \
-    | join -t$'\t' - pmid_ident.tsv \
-    > pmid_ident.dupes.tsv
-cut -f1 pmcid_ident.tsv \
-    | uniq -d \
-    | join -t$'\t' - pmcid_ident.tsv \
-    > pmcid_ident.dupes.tsv
-cut -f1 wikidata_ident.tsv \
-    | uniq -d \
-    | join -t$'\t' - wikidata_ident.tsv \
-    > wikidata_ident.dupes.tsv
-
-wc -l doi_ident.dupes.tsv pmid_ident.dupes.tsv pmcid_ident.dupes.tsv wikidata_ident.dupes.tsv >> counts.txt
-
diff --git a/extra/checks/check_hashes.sh b/extra/checks/check_hashes.sh
deleted file mode 100755
index 94102329..00000000
--- a/extra/checks/check_hashes.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-set -e -u -o pipefail
-
-export LC_ALL=C
-
-HASH_FILE=$1
-
-zcat $HASH_FILE \
-    | awk '{print $3 "\t" $1}' \
-    | rg -v '^\t' \
-    | sort -S 4G \
-    | uniq -d -w 40 \
-    > sha1_ident.dupes.tsv
-
-wc -l sha1_ident.dupes.tsv >> counts.txt
diff --git a/extra/checks/check_issnl.sh b/extra/checks/check_issnl.sh
deleted file mode 100755
index a28695e7..00000000
--- a/extra/checks/check_issnl.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env bash
-
-set -e -u -o pipefail
-
-export LC_ALL=C
-
-CONTAINER_DUMP=$1
-
-zcat $CONTAINER_DUMP \
-    | jq '[.issnl, .ident] | @tsv' -r \
-    | sort -S 4G \
-    | uniq -D -w 9 \
-    > issnl_ident.dupes.tsv
-
-wc -l issnl_ident.dupes.tsv >> counts.txt
diff --git a/extra/cleanups/.gitignore b/extra/cleanups/.gitignore
new file mode 100644
index 00000000..431c3bbc
--- /dev/null
+++ b/extra/cleanups/.gitignore
@@ -0,0 +1,2 @@
+*.txt
+*.tsv
diff --git a/extra/cleanups/check_extid.sh b/extra/cleanups/check_extid.sh
new file mode 100755
index 00000000..f74f50b6
--- /dev/null
+++ b/extra/cleanups/check_extid.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+set -e -u -o pipefail
+
+export LC_ALL=C
+
+EXTID_FILE=$1
+
+zcat $EXTID_FILE \
+    | awk '{print $3 "\t" $1}' \
+    | rg -v '^\t' \
+    | sort -S 4G \
+    > doi_ident.tsv
+zcat $EXTID_FILE \
+    | awk '{print $4 "\t" $1}' \
+    | rg -v '^\t' \
+    | sort -S 4G \
+    > pmid_ident.tsv
+zcat $EXTID_FILE \
+    | awk '{print $5 "\t" $1}' \
+    | rg -v '^\t' \
+    | sort -S 4G \
+    > pmcid_ident.tsv
+zcat $EXTID_FILE \
+    | awk '{print $6 "\t" $1}' \
+    | rg -v '^\t' \
+    | sort -S 4G \
+    > wikidata_ident.tsv
+
+# these identifiers aren't fixed-width, so we need to join (sigh)
+cut -f1 doi_ident.tsv \
+    | uniq -d \
+    | join -t$'\t' - doi_ident.tsv \
+    > doi_ident.dupes.tsv
+cut -f1 pmid_ident.tsv \
+    | uniq -d \
+    | join -t$'\t' - pmid_ident.tsv \
+    > pmid_ident.dupes.tsv
+cut -f1 pmcid_ident.tsv \
+    | uniq -d \
+    | join -t$'\t' - pmcid_ident.tsv \
+    > pmcid_ident.dupes.tsv
+cut -f1 wikidata_ident.tsv \
+    | uniq -d \
+    | join -t$'\t' - wikidata_ident.tsv \
+    > wikidata_ident.dupes.tsv
+
+wc -l doi_ident.dupes.tsv pmid_ident.dupes.tsv pmcid_ident.dupes.tsv wikidata_ident.dupes.tsv >> counts.txt
+
diff --git a/extra/cleanups/check_hashes.sh b/extra/cleanups/check_hashes.sh
new file mode 100755
index 00000000..94102329
--- /dev/null
+++ b/extra/cleanups/check_hashes.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+set -e -u -o pipefail
+
+export LC_ALL=C
+
+HASH_FILE=$1
+
+zcat $HASH_FILE \
+    | awk '{print $3 "\t" $1}' \
+    | rg -v '^\t' \
+    | sort -S 4G \
+    | uniq -d -w 40 \
+    > sha1_ident.dupes.tsv
+
+wc -l sha1_ident.dupes.tsv >> counts.txt
diff --git a/extra/cleanups/check_issnl.sh b/extra/cleanups/check_issnl.sh
new file mode 100755
index 00000000..a28695e7
--- /dev/null
+++ b/extra/cleanups/check_issnl.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+set -e -u -o pipefail
+
+export LC_ALL=C
+
+CONTAINER_DUMP=$1
+
+zcat $CONTAINER_DUMP \
+    | jq '[.issnl, .ident] | @tsv' -r \
+    | sort -S 4G \
+    | uniq -D -w 9 \
+    > issnl_ident.dupes.tsv
+
+wc -l issnl_ident.dupes.tsv >> counts.txt
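Each of the relocated check scripts takes a single gzipped dump file as its
only argument and appends summary line counts to counts.txt in the working
directory. A minimal usage sketch (the snapshot filenames and paths are
illustrative, not part of this commit):

    cd extra/cleanups/
    ./check_extid.sh /srv/fatcat/snapshots/release_extid.tsv.gz
    ./check_hashes.sh /srv/fatcat/snapshots/file_hashes.tsv.gz
    ./check_issnl.sh /srv/fatcat/snapshots/container_export.json.gz
    # line counts of the *.dupes.tsv outputs accumulate here:
    cat counts.txt
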
diff --git a/extra/cleanups/scripts/fixup_longtail_issnl_unique.py b/extra/cleanups/scripts/fixup_longtail_issnl_unique.py
new file mode 100755
index 00000000..ea615a13
--- /dev/null
+++ b/extra/cleanups/scripts/fixup_longtail_issnl_unique.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+
+"""
+This file must be moved to the fatcat:python/ directory (aka, not in
+fatcat:extra/fixups) to run. It's a "one-off", so probably will bitrot pretty
+quickly. There are no tests.
+
+Example invocation:
+
+    zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | ./fixup_longtail_issnl_unique.py /srv/fatcat/datasets/single_domain_issnl.tsv -
+
+See also:
+- bnewbold/scratch:mellon/201904_longtail_issn.md
+- aitio:/rapida/OA-JOURNAL-TESTCRAWL-TWO-2018
+- https://archive.org/details/OA-JOURNAL-TESTCRAWL-TWO-2018-extra
+= https://archive.org/download/ia_longtail_dumpgrobidmetainsertable_2018-09-23/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz
+
+QA notes:
+
+- everything on revistas.uv.mx linked to 2395-9495, which is only one journal
+  on that domain. blacklist 'revistas' in the domain?
+- afjg3yjdjbf2dad47t5jq7nlbe => 2305-7254 ok match but not perfect (wrong year
+  of conference). probably better than nothing.
+- elib.mi.sanu.ac.rs has several journals on domain. container_name was correct.
+- revistavirtual.ucn.edu.co has 2x journals
+- lpchkxkp5jecdgrab33fxodd7y bad match
+- k36web33jvf25by64gop4yil7q an IR, not a journal (ok)
+- hvxercwasjhotpewb5xfadyyle good match, though only an abstract (in URL). full
+  articles get DOIs
+- release_epkiok6y3zhsnp3no2lkljznza not a paper; journal match batch (cato, wtf)
+- release_b3jolh25mbg4djrqotgosyeike jfr.unibo.it good
+- release_bzr35evb4bdd3mxex6gxn6dcyy conf.ostis.net good?
+- uzspace.uzulu.ac.za IR, not a container
+- release_5lt36yy3vre2nnig46toy67kdi wrong, multiple journals
+- release_54hmv5gvtjghjk7rpcbp2pn2ky good
+- release_6h7doxfaxnao3jm7f6jkfdpdwm good
+- release_6pio5hz6bvawfnodhkvmfk4jei correct but stub
+- release_7oobqygqczapbgdvvgbxfyvqli correct
+- release_tsljmbevpzfpxiezzv7puwbilq good
+
+general notes:
+- GROBID works pretty well. references look pretty good, should match. there is
+  a non-trivial fraction of non-journal content, but it isn't too bad
+- this "single-journal domain" premise doesn't work
+- could probably do a subset based on "is the journal name in the domain name",
+  or "is domain acronym of journal name"
+- surprising number of IRs with ISSNs in here
+- might have better luck blacklisting out latin american TLDs, which tend to
+  host many journals?
+"""
+
+import os, sys, argparse
+import json
+import sqlite3
+import itertools
+
+import fatcat_openapi_client
+from fatcat_tools import authenticated_api
+from fatcat_tools.importers.common import EntityImporter, clean, LinePusher
+from fatcat_tools.importers.arabesque import b32_hex
+
+
+class LongtailIssnlSingleDomainFixup(EntityImporter):
+    """
+    Fixup script for bootstrap longtail OA release entities which don't have a
+    container but are confidently associated with an ISSN-L based on file
+    domain.
+
+    Expected to be a one-time fixup impacting about 600k entities (around half
+    the longtail OA batch).
+
+    Reads in a mapping of unique domain-ISSNL mappings, and then iterates over
+    the original matched import batch file. For each line in the latter:
+
+    - checks if in-scope based on domain-ISSNL map
+    - uses API to lookup file (by SHA-1) and confirm domain in URL list
+    - look up releases for file and retain the longtail-oa ones (an extra flag)
+    - if release is longtail-oa and no container, set the container based on
+      ISSN-L (using cached lookup)
+    - use EntityImporter stuff to manage update/editgroup queue
+    """
+
+    def __init__(self, api, domain_issnl_tsv_file, **kwargs):
+
+        eg_desc = kwargs.pop('editgroup_description',
+            "Fixup for longtail OA releases that can be matched to specific container by file domain / ISSN-L mapping")
+        eg_extra = kwargs.pop('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.LongtailIssnlSingleDomainFixup')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        self._domain_issnl_map = self.load_domain_issnl(domain_issnl_tsv_file)
+        self._issnl_container_map = dict()
+
+    def load_domain_issnl(self, tsv_file):
+        print("Loading domain ISSN-L file...")
+        m = dict()
+        for l in tsv_file:
+            l = l.strip().split('\t')
+            assert len(l) == 2
+            domain = l[0].lower()
+            issnl = l[1]
+            assert len(issnl) == 9 and issnl[4] == '-'
+            m[domain] = issnl
+        print("Got {} matchings.".format(len(m)))
+        return m
+
+    def want(self, raw_record):
+        # do it all in parse_record()
+        return True
+
+    def parse_record(self, row):
+        """
+        TSV rows:
+        - sha1 b32 key
+        - JSON string: CDX-ish
+            - surt
+            - url
+            -
+        - mime
+        - size (?)
+        - JSON string: grobid metadata
+        """
+
+        # parse row
+        row = row.split('\t')
+        assert len(row) == 5
+        sha1 = b32_hex(row[0][5:])
+        cdx_dict = json.loads(row[1])
+        url = cdx_dict['url']
+        domain = url.split('/')[2].lower()
+
+        if not domain:
+            self.counts['skip-domain-blank'] += 1
+            return None
+
+        # domain in scope?
+        issnl = self._domain_issnl_map.get(domain)
+        if not issnl:
+            self.counts['skip-domain-scope'] += 1
+            return None
+        if 'revistas' in domain.lower().split('.'):
+            self.counts['skip-domain-revistas'] += 1
+            return None
+
+        # lookup file
+        #print(sha1)
+        try:
+            file_entity = self.api.lookup_file(sha1=sha1, expand="releases")
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status == 404:
+                self.counts['skip-file-not-found'] += 1
+                return None
+            else:
+                raise err
+
+        # container ident
+        container_id = self.lookup_issnl(issnl)
+        if not container_id:
+            self.counts['skip-container-not-found'] += 1
+            return None
+
+        # confirm domain
+        url_domain_match = False
+        for furl in file_entity.urls:
+            fdomain = furl.url.split('/')[2].lower()
+            if domain == fdomain:
+                url_domain_match = True
+                break
+        if not url_domain_match:
+            self.counts['skip-no-domain-match'] += 1
+            return None
+
+        # fetch releases
+        releases = [r for r in file_entity.releases if (r.extra.get('longtail_oa') == True and r.container_id == None)]
+        if not releases:
+            #print(file_entity.releases)
+            self.counts['skip-no-releases'] += 1
+            return None
+
+        # fetch full release objects (need abstract, etc, for updating)
+        releases = [self.api.get_release(r.ident) for r in releases]
+
+        # set container_id
+        for r in releases:
+            r.container_id = container_id
+        return releases
+
+    def try_update(self, re_list):
+        for re in re_list:
+            self.api.update_release(self.get_editgroup_id(), re.ident, re)
+            self.counts['update'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        raise NotImplementedError
+
+def run_fixup(args):
+    fmi = LongtailIssnlSingleDomainFixup(args.api,
+        args.domain_issnl_tsv_file,
+        edit_batch_size=args.batch_size)
+    LinePusher(fmi, args.insertable_tsv_file).run()
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--api-host-url',
+        default="http://localhost:9411/v0",
+        help="connect to this host/port")
+    parser.add_argument('--batch-size',
+        help="size of batch to send",
+        default=50, type=int)
+    parser.add_argument('domain_issnl_tsv_file',
+        help="domain/ISSNL mapping TSV file",
+        type=argparse.FileType('r'))
+    parser.add_argument('insertable_tsv_file',
+        help="dumpgrobidmetainsertable TSV file to work over",
+        default=sys.stdin, type=argparse.FileType('r'))
+
+    auth_var = "FATCAT_AUTH_SANDCRAWLER"
+
+    args = parser.parse_args()
+
+    args.api = authenticated_api(
+        args.api_host_url,
+        # token is an optional kwarg (can be empty string, None, etc)
+        token=os.environ.get(auth_var))
+    run_fixup(args)
+
+if __name__ == '__main__':
+    main()
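The load_domain_issnl() method above expects a two-column, tab-separated
mapping of domain to ISSN-L (nine characters with a dash in the middle), one
entry per line. A hypothetical fragment, with made-up domains and ISSN-Ls,
followed by the invocation pattern from the docstring (the token value is a
placeholder):

    # fabricate a tiny mapping file for illustration; real mappings come from
    # the longtail crawl analysis
    printf 'journal.example.ac.rs\t1234-5678\nojs.example.edu.co\t8765-4321\n' > single_domain_issnl.tsv
    # token is read from the environment by authenticated_api()
    export FATCAT_AUTH_SANDCRAWLER="..."
    zcat dumpgrobidmetainsertable.tsv.gz \
        | ./fixup_longtail_issnl_unique.py single_domain_issnl.tsv -
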
diff --git a/extra/collectd_statsd.conf b/extra/collectd_statsd.conf
deleted file mode 100644
index a434ed9f..00000000
--- a/extra/collectd_statsd.conf
+++ /dev/null
@@ -1,22 +0,0 @@
-
-# This configures collectd to accept statsd metrics. They will end up under,
-# eg, <hostname>.statsd.derive-*. On a local machine they can be viewed with,
-# eg, kcollectd, which is helpful for basic debugging of metrics.
-#
-# To use, copy this file to, eg: /etc/collectd/collectd.conf.d/fatcat_statsd.conf
-# then restart collectd: sudo service collectd restart
-
-LoadPlugin statsd
-
-<Plugin statsd>
-    Host "127.0.0.1"
-    #Host "::1"
-    Port "8125"
-    DeleteSets true
-    DeleteCounters true
-    DeleteTimers true
-    DeleteGauges true
-
-    TimerCount true
-    #CounterSum false
-</Plugin>
diff --git a/extra/deployment/collectd_statsd.conf b/extra/deployment/collectd_statsd.conf
new file mode 100644
index 00000000..a434ed9f
--- /dev/null
+++ b/extra/deployment/collectd_statsd.conf
@@ -0,0 +1,22 @@
+
+# This configures collectd to accept statsd metrics. They will end up under,
+# eg, <hostname>.statsd.derive-*. On a local machine they can be viewed with,
+# eg, kcollectd, which is helpful for basic debugging of metrics.
+#
+# To use, copy this file to, eg: /etc/collectd/collectd.conf.d/fatcat_statsd.conf
+# then restart collectd: sudo service collectd restart
+
+LoadPlugin statsd
+
+<Plugin statsd>
+    Host "127.0.0.1"
+    #Host "::1"
+    Port "8125"
+    DeleteSets true
+    DeleteCounters true
+    DeleteTimers true
+    DeleteGauges true
+
+    TimerCount true
+    #CounterSum false
+</Plugin>
diff --git a/extra/deployment/nginx_fatcat-api b/extra/deployment/nginx_fatcat-api
new file mode 100644
index 00000000..13c1da99
--- /dev/null
+++ b/extra/deployment/nginx_fatcat-api
@@ -0,0 +1,39 @@
+
+upstream fatcatd {
+    server localhost:9411;
+}
+
+server {
+    listen 80;
+    listen [::]:80;
+    listen 443 ssl spdy;
+    listen [::]:443 ssl spdy;
+    server_name api.fatcat.wiki;
+
+    ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem;
+
+    #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
+    add_header X-Frame-Options "SAMEORIGIN";        # 'always' if nginx > 1.7.5
+    add_header X-Content-Type-Options "nosniff";    # 'always' if nginx > 1.7.5
+    add_header X-Xss-Protection "1";
+    # Enable STS with one year period (breaks http; optional)
+    #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";
+
+    access_log /var/log/nginx/access.log;
+
+    location / {
+        root /srv/http/default/www/;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header Host $http_host;
+        proxy_redirect off;
+        proxy_pass http://fatcatd;
+    }
+
+    # Let's Encrypt SSL Certs
+    location /.well-known/acme-challenge/ {
+        root /var/www/letsencrypt;
+        autoindex off;
+    }
+}
diff --git a/extra/deployment/nginx_fatcat-web b/extra/deployment/nginx_fatcat-web
new file mode 100644
index 00000000..7909b0be
--- /dev/null
+++ b/extra/deployment/nginx_fatcat-web
@@ -0,0 +1,35 @@
+
+server {
+    listen 80;
+    listen [::]:80;
+    listen 443 ssl spdy;
+    listen [::]:443 ssl spdy;
+    server_name fatcat.wiki www.fatcat.wiki;
+
+    ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem;
+
+    #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
+    add_header X-Frame-Options "SAMEORIGIN";        # 'always' if nginx > 1.7.5
+    add_header X-Content-Type-Options "nosniff";    # 'always' if nginx > 1.7.5
+    add_header X-Xss-Protection "1";
+    # Enable STS with one year period (breaks http; optional)
+    #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";
+
+    access_log /var/log/nginx/access.log;
+
+    location / {
+        try_files $uri @fatcat-web;
+    }
+
+    location @fatcat-web {
+        include uwsgi_params;
+        uwsgi_pass unix:/var/run/fatcat-web/uwsgi.sock;
+    }
+
+    # Let's Encrypt SSL Certs
+    location /.well-known/acme-challenge/ {
+        root /var/www/letsencrypt;
+        autoindex off;
+    }
+}
diff --git a/extra/deployment/systemd_fatcat-web.service b/extra/deployment/systemd_fatcat-web.service
new file mode 100644
index 00000000..ad8b2bec
--- /dev/null
+++ b/extra/deployment/systemd_fatcat-web.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=fatcat web interface
+# TODO: would add syslog.target here if we used it
+After=network.target
+
+[Service]
+WorkingDirectory=/srv/fatcat/src/python
+SyslogIdentifier=fatcat-web
+#Environment="TMPDIR=/run/grobid/tmp/"
+# XXX: insert uwsgi_fatcat-web.sh here
+ExecStart=
+User=fatcat
+Type=simple
+
+[Install]
+WantedBy=multi-user.target
diff --git a/extra/deployment/uwsgi_fatcat-web.sh b/extra/deployment/uwsgi_fatcat-web.sh
new file mode 100755
index 00000000..112d7857
--- /dev/null
+++ b/extra/deployment/uwsgi_fatcat-web.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -eu
+
+# sudo mkdir /var/run/fatcat-web
+# sudo chown bnewbold:bnewbold /var/run/fatcat-web/
+
+FATCAT_WEB_DIR=python
+cd $FATCAT_WEB_DIR
+
+uwsgi \
+    -s /var/run/fatcat-web/uwsgi.sock \
+    --manage-script-name --mount \
+    --plugin python3 \
+    --virtualenv .venv \
+    --mount /:fatcat:app
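For reference, these deployment files are meant to be copied into the usual
system locations and the service enabled; a rough sketch of the steps, where
the destination paths and unit name are assumptions rather than documented in
this commit:

    # enable the nginx site (Debian-style sites-available layout assumed)
    sudo cp extra/deployment/nginx_fatcat-web /etc/nginx/sites-available/fatcat-web
    sudo ln -s /etc/nginx/sites-available/fatcat-web /etc/nginx/sites-enabled/
    sudo nginx -t && sudo service nginx reload
    # install and start the uwsgi-backed web service
    sudo cp extra/deployment/systemd_fatcat-web.service /etc/systemd/system/fatcat-web.service
    sudo systemctl daemon-reload
    sudo systemctl enable --now fatcat-web
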
diff --git a/extra/fixups/fixup_longtail_issnl_unique.py b/extra/fixups/fixup_longtail_issnl_unique.py
deleted file mode 100755
index ea615a13..00000000
--- a/extra/fixups/fixup_longtail_issnl_unique.py
+++ /dev/null
@@ -1,232 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This file must be moved to the fatcat:python/ directory (aka, not in
-fatcat:extra/fixups) to run. It's a "one-off", so probably will bitrot pretty
-quickly. There are no tests.
-
-Example invocation:
-
-    zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | ./fixup_longtail_issnl_unique.py /srv/fatcat/datasets/single_domain_issnl.tsv -
-
-See also:
-- bnewbold/scratch:mellon/201904_longtail_issn.md
-- aitio:/rapida/OA-JOURNAL-TESTCRAWL-TWO-2018
-- https://archive.org/details/OA-JOURNAL-TESTCRAWL-TWO-2018-extra
-= https://archive.org/download/ia_longtail_dumpgrobidmetainsertable_2018-09-23/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz
-
-QA notes:
-
-- everything on revistas.uv.mx linked to 2395-9495, which is only one journal
-  on that domain. blacklist 'revistas' in the domain?
-- afjg3yjdjbf2dad47t5jq7nlbe => 2305-7254 ok match but not perfect (wrong year
-  of conference). probably better than nothing.
-- elib.mi.sanu.ac.rs has several journals on domain. container_name was correct.
-- revistavirtual.ucn.edu.co has 2x journals
-- lpchkxkp5jecdgrab33fxodd7y bad match
-- k36web33jvf25by64gop4yil7q an IR, not a journal (ok)
-- hvxercwasjhotpewb5xfadyyle good match, though only an abstract (in URL). full
-  articles get DOIs
-- release_epkiok6y3zhsnp3no2lkljznza not a paper; journal match batch (cato, wtf)
-- release_b3jolh25mbg4djrqotgosyeike jfr.unibo.it good
-- release_bzr35evb4bdd3mxex6gxn6dcyy conf.ostis.net good?
-- uzspace.uzulu.ac.za IR, not a container
-- release_5lt36yy3vre2nnig46toy67kdi wrong, multiple journals
-- release_54hmv5gvtjghjk7rpcbp2pn2ky good
-- release_6h7doxfaxnao3jm7f6jkfdpdwm good
-- release_6pio5hz6bvawfnodhkvmfk4jei correct but stub
-- release_7oobqygqczapbgdvvgbxfyvqli correct
-- release_tsljmbevpzfpxiezzv7puwbilq good
-
-general notes:
-- GROBID works pretty well. references look pretty good, should match. there is
-  a non-trivial fraction of non-journal content, but it isn't too bad
-- this "single-journal domain" premise doesn't work
-- could probably do a subset based on "is the journal name in the domain name",
-  or "is domain acronym of journal name"
-- surprising number of IRs with ISSNs in here
-- might have better luck blacklisting out latin american TLDs, which tend to
-  host many journals?
-"""
-
-import os, sys, argparse
-import json
-import sqlite3
-import itertools
-
-import fatcat_openapi_client
-from fatcat_tools import authenticated_api
-from fatcat_tools.importers.common import EntityImporter, clean, LinePusher
-from fatcat_tools.importers.arabesque import b32_hex
-
-
-class LongtailIssnlSingleDomainFixup(EntityImporter):
-    """
-    Fixup script for bootstrap longtail OA release entities which don't have a
-    container but are confidently associated with an ISSN-L based on file
-    domain.
-
-    Expected to be a one-time fixup impacting about 600k entities (around half
-    the longtail OA batch).
-
-    Reads in a mapping of unique domain-ISSNL mappings, and then iterates over
-    the original matched import batch file. For each line in the latter:
-
-    - checks if in-scope based on domain-ISSNL map
-    - uses API to lookup file (by SHA-1) and confirm domain in URL list
-    - look up releases for file and retain the longtail-oa ones (an extra flag)
-    - if release is longtail-oa and no container, set the container based on
-      ISSN-L (using cached lookup)
-    - use EntityImporter stuff to manage update/editgroup queue
-    """
-
-    def __init__(self, api, domain_issnl_tsv_file, **kwargs):
-
-        eg_desc = kwargs.pop('editgroup_description',
-            "Fixup for longtail OA releases that can be matched to specific container by file domain / ISSN-L mapping")
-        eg_extra = kwargs.pop('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.LongtailIssnlSingleDomainFixup')
-        super().__init__(api,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
-
-        self._domain_issnl_map = self.load_domain_issnl(domain_issnl_tsv_file)
-        self._issnl_container_map = dict()
-
-    def load_domain_issnl(self, tsv_file):
-        print("Loading domain ISSN-L file...")
-        m = dict()
-        for l in tsv_file:
-            l = l.strip().split('\t')
-            assert len(l) == 2
-            domain = l[0].lower()
-            issnl = l[1]
-            assert len(issnl) == 9 and issnl[4] == '-'
-            m[domain] = issnl
-        print("Got {} matchings.".format(len(m)))
-        return m
-
-    def want(self, raw_record):
-        # do it all in parse_record()
-        return True
-
-    def parse_record(self, row):
-        """
-        TSV rows:
-        - sha1 b32 key
-        - JSON string: CDX-ish
-            - surt
-            - url
-            -
-        - mime
-        - size (?)
-        - JSON string: grobid metadata
-        """
-
-        # parse row
-        row = row.split('\t')
-        assert len(row) == 5
-        sha1 = b32_hex(row[0][5:])
-        cdx_dict = json.loads(row[1])
-        url = cdx_dict['url']
-        domain = url.split('/')[2].lower()
-
-        if not domain:
-            self.counts['skip-domain-blank'] += 1
-            return None
-
-        # domain in scope?
-        issnl = self._domain_issnl_map.get(domain)
-        if not issnl:
-            self.counts['skip-domain-scope'] += 1
-            return None
-        if 'revistas' in domain.lower().split('.'):
-            self.counts['skip-domain-revistas'] += 1
-            return None
-
-        # lookup file
-        #print(sha1)
-        try:
-            file_entity = self.api.lookup_file(sha1=sha1, expand="releases")
-        except fatcat_openapi_client.rest.ApiException as err:
-            if err.status == 404:
-                self.counts['skip-file-not-found'] += 1
-                return None
-            else:
-                raise err
-
-        # container ident
-        container_id = self.lookup_issnl(issnl)
-        if not container_id:
-            self.counts['skip-container-not-found'] += 1
-            return None
-
-        # confirm domain
-        url_domain_match = False
-        for furl in file_entity.urls:
-            fdomain = furl.url.split('/')[2].lower()
-            if domain == fdomain:
-                url_domain_match = True
-                break
-        if not url_domain_match:
-            self.counts['skip-no-domain-match'] += 1
-            return None
-
-        # fetch releases
-        releases = [r for r in file_entity.releases if (r.extra.get('longtail_oa') == True and r.container_id == None)]
-        if not releases:
-            #print(file_entity.releases)
-            self.counts['skip-no-releases'] += 1
-            return None
-
-        # fetch full release objects (need abstract, etc, for updating)
-        releases = [self.api.get_release(r.ident) for r in releases]
-
-        # set container_id
-        for r in releases:
-            r.container_id = container_id
-        return releases
-
-    def try_update(self, re_list):
-        for re in re_list:
-            self.api.update_release(self.get_editgroup_id(), re.ident, re)
-            self.counts['update'] += 1
-        return False
-
-    def insert_batch(self, batch):
-        raise NotImplementedError
-
-def run_fixup(args):
-    fmi = LongtailIssnlSingleDomainFixup(args.api,
-        args.domain_issnl_tsv_file,
-        edit_batch_size=args.batch_size)
-    LinePusher(fmi, args.insertable_tsv_file).run()
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--api-host-url',
-        default="http://localhost:9411/v0",
-        help="connect to this host/port")
-    parser.add_argument('--batch-size',
-        help="size of batch to send",
-        default=50, type=int)
-    parser.add_argument('domain_issnl_tsv_file',
-        help="domain/ISSNL mapping TSV file",
-        type=argparse.FileType('r'))
-    parser.add_argument('insertable_tsv_file',
-        help="dumpgrobidmetainsertable TSV file to work over",
-        default=sys.stdin, type=argparse.FileType('r'))
-
-    auth_var = "FATCAT_AUTH_SANDCRAWLER"
-
-    args = parser.parse_args()
-
-    args.api = authenticated_api(
-        args.api_host_url,
-        # token is an optional kwarg (can be empty string, None, etc)
-        token=os.environ.get(auth_var))
-    run_fixup(args)
-
-if __name__ == '__main__':
-    main()
diff --git a/extra/nginx_fatcat-api b/extra/nginx_fatcat-api
deleted file mode 100644
index 13c1da99..00000000
--- a/extra/nginx_fatcat-api
+++ /dev/null
@@ -1,39 +0,0 @@
-
-upstream fatcatd {
-    server localhost:9411;
-}
-
-server {
-    listen 80;
-    listen [::]:80;
-    listen 443 ssl spdy;
-    listen [::]:443 ssl spdy;
-    server_name api.fatcat.wiki;
-
-    ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem;
-    ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem;
-
-    #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
-    add_header X-Frame-Options "SAMEORIGIN";        # 'always' if nginx > 1.7.5
-    add_header X-Content-Type-Options "nosniff";    # 'always' if nginx > 1.7.5
-    add_header X-Xss-Protection "1";
-    # Enable STS with one year period (breaks http; optional)
-    #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";
-
-    access_log /var/log/nginx/access.log;
-
-    location / {
-        root /srv/http/default/www/;
-        proxy_set_header X-Real-IP $remote_addr;
-        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-        proxy_set_header Host $http_host;
-        proxy_redirect off;
-        proxy_pass http://fatcatd;
-    }
-
-    # Let's Encrypt SSL Certs
-    location /.well-known/acme-challenge/ {
-        root /var/www/letsencrypt;
-        autoindex off;
-    }
-}
diff --git a/extra/nginx_fatcat-web b/extra/nginx_fatcat-web
deleted file mode 100644
index 7909b0be..00000000
--- a/extra/nginx_fatcat-web
+++ /dev/null
@@ -1,35 +0,0 @@
-
-server {
-    listen 80;
-    listen [::]:80;
-    listen 443 ssl spdy;
-    listen [::]:443 ssl spdy;
-    server_name fatcat.wiki www.fatcat.wiki;
-
-    ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem;
-    ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem;
-
-    #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
-    add_header X-Frame-Options "SAMEORIGIN";        # 'always' if nginx > 1.7.5
-    add_header X-Content-Type-Options "nosniff";    # 'always' if nginx > 1.7.5
-    add_header X-Xss-Protection "1";
-    # Enable STS with one year period (breaks http; optional)
-    #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";
-
-    access_log /var/log/nginx/access.log;
-
-    location / {
-        try_files $uri @fatcat-web;
-    }
-
-    location @fatcat-web {
-        include uwsgi_params;
-        uwsgi_pass unix:/var/run/fatcat-web/uwsgi.sock;
-    }
-
-    # Let's Encrypt SSL Certs
-    location /.well-known/acme-challenge/ {
-        root /var/www/letsencrypt;
-        autoindex off;
-    }
-}
diff --git a/extra/systemd_fatcat-web.service b/extra/systemd_fatcat-web.service
deleted file mode 100644
index ad8b2bec..00000000
--- a/extra/systemd_fatcat-web.service
+++ /dev/null
@@ -1,16 +0,0 @@
-[Unit]
-Description=fatcat web interface
-# TODO: would add syslog.target here if we used it
-After=network.target
-
-[Service]
-WorkingDirectory=/srv/fatcat/src/python
-SyslogIdentifier=fatcat-web
-#Environment="TMPDIR=/run/grobid/tmp/"
-# XXX: insert uwsgi_fatcat-web.sh here
-ExecStart=
-User=fatcat
-Type=simple
-
-[Install]
-WantedBy=multi-user.target
diff --git a/extra/update_gh-pages.sh b/extra/update_gh-pages.sh
deleted file mode 100755
index 2771c24e..00000000
--- a/extra/update_gh-pages.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-# Note: this script is BROKEN; the resulting docs don't have javascript search,
-# throw a javascript error, and don't include private/internal docs. Not a
-# priority right now.
-
-set -e -u -o pipefail
-
-cd rust
-cargo doc
-mkdir -p /tmp/fatcat-ghpages
-cp -r target/doc/fatcat target/doc/fatcat_openapi /tmp/fatcat-ghpages
-cd ..
-git checkout gh-pages
-mv fatcat fatcat.old_docs || true
-mv fatcat_openapi fatcat_openapi.old_docs || true
-mv /tmp/fatcat-ghpages/fatcat .
-mv /tmp/fatcat-ghpages/fatcat_openapi .
-git add fatcat fatcat_openapi
-git commit -m "updating rendered manpage for github docs" || true
-git checkout master
-rm -r /tmp/fatcat-ghpages
-
-echo "DONE"
diff --git a/extra/uwsgi_fatcat-web.sh b/extra/uwsgi_fatcat-web.sh
deleted file mode 100644
index 112d7857..00000000
--- a/extra/uwsgi_fatcat-web.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-# sudo mkdir /var/run/fatcat-web
-# sudo chown bnewbold:bnewbold /var/run/fatcat-web/
-
-FATCAT_WEB_DIR=python
-cd $FATCAT_WEB_DIR
-
-uwsgi \
-    -s /var/run/fatcat-web/uwsgi.sock \
-    --manage-script-name --mount \
-    --plugin python3 \
-    --virtualenv .venv \
-    --mount /:fatcat:app