diff options
Diffstat (limited to 'extra/cleanups')
-rw-r--r-- | extra/cleanups/.gitignore | 2
-rwxr-xr-x | extra/cleanups/check_extid.sh | 49
-rwxr-xr-x | extra/cleanups/check_hashes.sh | 16
-rwxr-xr-x | extra/cleanups/check_issnl.sh | 15
-rwxr-xr-x | extra/cleanups/scripts/fixup_longtail_issnl_unique.py | 232
5 files changed, 314 insertions, 0 deletions
diff --git a/extra/cleanups/.gitignore b/extra/cleanups/.gitignore new file mode 100644 index 00000000..431c3bbc --- /dev/null +++ b/extra/cleanups/.gitignore @@ -0,0 +1,2 @@ +*.txt +*.tsv diff --git a/extra/cleanups/check_extid.sh b/extra/cleanups/check_extid.sh new file mode 100755 index 00000000..f74f50b6 --- /dev/null +++ b/extra/cleanups/check_extid.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +set -e -u -o pipefail + +export LC_ALL=C + +EXTID_FILE=$1 + +zcat $EXTID_FILE \ + | awk '{print $3 "\t" $1}' \ + | rg -v '^\t' \ + | sort -S 4G \ + > doi_ident.tsv +zcat $EXTID_FILE \ + | awk '{print $4 "\t" $1}' \ + | rg -v '^\t' \ + | sort -S 4G \ + > pmid_ident.tsv +zcat $EXTID_FILE \ + | awk '{print $5 "\t" $1}' \ + | rg -v '^\t' \ + | sort -S 4G \ + > pmcid_ident.tsv +zcat $EXTID_FILE \ + | awk '{print $6 "\t" $1}' \ + | rg -v '^\t' \ + | sort -S 4G \ + > wikidata_ident.tsv + +# these identifiers aren't fixed-width, so we need to join (sigh) +cut -f1 doi_ident.tsv \ + | uniq -d \ + | join -t$'\t' - doi_ident.tsv \ + > doi_ident.dupes.tsv +cut -f1 pmid_ident.tsv \ + | uniq -d \ + | join -t$'\t' - pmid_ident.tsv \ + > pmid_ident.dupes.tsv +cut -f1 pmcid_ident.tsv \ + | uniq -d \ + | join -t$'\t' - pmcid_ident.tsv \ + > pmcid_ident.dupes.tsv +cut -f1 wikidata_ident.tsv \ + | uniq -d \ + | join -t$'\t' - wikidata_ident.tsv \ + > wikidata_ident.dupes.tsv + +wc -l doi_ident.dupes.tsv pmid_ident.dupes.tsv pmcid_ident.dupes.tsv wikidata_ident.dupes.tsv >> counts.txt + diff --git a/extra/cleanups/check_hashes.sh b/extra/cleanups/check_hashes.sh new file mode 100755 index 00000000..94102329 --- /dev/null +++ b/extra/cleanups/check_hashes.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +set -e -u -o pipefail + +export LC_ALL=C + +HASH_FILE=$1 + +zcat $HASH_FILE \ + | awk '{print $3 "\t" $1}' \ + | rg -v '^\t' \ + | sort -S 4G \ + | uniq -d -w 40 \ + > sha1_ident.dupes.tsv + +wc -l sha1_ident.dupes.tsv >> counts.txt diff --git a/extra/cleanups/check_issnl.sh 
b/extra/cleanups/check_issnl.sh new file mode 100755 index 00000000..a28695e7 --- /dev/null +++ b/extra/cleanups/check_issnl.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e -u -o pipefail + +export LC_ALL=C + +CONTAINER_DUMP=$1 + +zcat $CONTAINER_DUMP \ + | jq '[.issnl, .ident] | @tsv' -r \ + | sort -S 4G \ + | uniq -D -w 9 \ + > issnl_ident.dupes.tsv + +wc -l issnl_ident.dupes.tsv >> counts.txt diff --git a/extra/cleanups/scripts/fixup_longtail_issnl_unique.py b/extra/cleanups/scripts/fixup_longtail_issnl_unique.py new file mode 100755 index 00000000..ea615a13 --- /dev/null +++ b/extra/cleanups/scripts/fixup_longtail_issnl_unique.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 + +""" +This file must be moved to the fatcat:python/ directory (aka, not in +fatcat:extra/fixups) to run. It's a "one-off", so probably will bitrot pretty +quickly. There are no tests. + +Example invocation: + + zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | ./fixup_longtail_issnl_unique.py /srv/fatcat/datasets/single_domain_issnl.tsv - + +See also: +- bnewbold/scratch:mellon/201904_longtail_issn.md +- aitio:/rapida/OA-JOURNAL-TESTCRAWL-TWO-2018 +- https://archive.org/details/OA-JOURNAL-TESTCRAWL-TWO-2018-extra += https://archive.org/download/ia_longtail_dumpgrobidmetainsertable_2018-09-23/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz + +QA notes: + +- everything on revistas.uv.mx linked to 2395-9495, which is only one journal + on that domain. blacklist 'revistas' in the domain? +- afjg3yjdjbf2dad47t5jq7nlbe => 2305-7254 ok match but not perfect (wrong year + of conference). probably better than nothing. +- elib.mi.sanu.ac.rs has several journals on domain. container_name was correct. +- revistavirtual.ucn.edu.co has 2x journals +- lpchkxkp5jecdgrab33fxodd7y bad match +- k36web33jvf25by64gop4yil7q an IR, not a journal (ok) +- hvxercwasjhotpewb5xfadyyle good match, though only an abstract (in URL). 
full + articles get DOIs +- release_epkiok6y3zhsnp3no2lkljznza not a paper; journal match batch (cato, wtf) +- release_b3jolh25mbg4djrqotgosyeike jfr.unibo.it good +- release_bzr35evb4bdd3mxex6gxn6dcyy conf.ostis.net good? +- uzspace.uzulu.ac.za IR, not a container +- release_5lt36yy3vre2nnig46toy67kdi wrong, multiple journals +- release_54hmv5gvtjghjk7rpcbp2pn2ky good +- release_6h7doxfaxnao3jm7f6jkfdpdwm good +- release_6pio5hz6bvawfnodhkvmfk4jei correct but stub +- release_7oobqygqczapbgdvvgbxfyvqli correct +- release_tsljmbevpzfpxiezzv7puwbilq good + +general notes: +- GROBID works pretty well. references look pretty good, should match. there is + a non-trivial fraction of non-journal content, but it isn't too bad +- this "single-journal domain" premise doesn't work +- could probably do a subset based on "is the journal name in the domain name", + or "is domain acronym of journal name" +- surprising number of IRs with ISSNs in here +- might have better luck blacklisting out latin american TLDs, which tend to + host many journals? +""" + +import os, sys, argparse +import json +import sqlite3 +import itertools + +import fatcat_openapi_client +from fatcat_tools import authenticated_api +from fatcat_tools.importers.common import EntityImporter, clean, LinePusher +from fatcat_tools.importers.arabesque import b32_hex + + +class LongtailIssnlSingleDomainFixup(EntityImporter): + """ + Fixup script for bootstrap longtail OA release entities which don't have a + container but are confidently associated with an ISSN-L based on file + domain. + + Expected to be a one-time fixup impacting about 600k entities (around half + the longtail OA batch). + + Reads in a mapping of unique domain-ISSNL mappings, and then iterates over + the original matched import batch file. 
For each line in the later: + + - checks if in-scope based on domain-ISSNL map + - uses API to lookup file (by SHA-1) and confirm domain in URL list + - look up releases for file and retain the longtail-oa ones (an extra flag) + - if release is longtail-oa and no container, set the container based on + ISSN-L (using cached lookup) + - use EntityImporter stuff to manage update/editgroup queue + """ + + def __init__(self, api, domain_issnl_tsv_file, **kwargs): + + eg_desc = kwargs.pop('editgroup_description', + "Fixup for longtail OA releases that can be matched to specific container by file domain / ISSN-L mapping") + eg_extra = kwargs.pop('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.LongtailIssnlSingleDomainFixup') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + self._domain_issnl_map = self.load_domain_issnl(domain_issnl_tsv_file) + self._issnl_container_map = dict() + + def load_domain_issnl(self, tsv_file): + print("Loading domain ISSN-L file...") + m = dict() + for l in tsv_file: + l = l.strip().split('\t') + assert len(l) == 2 + domain = l[0].lower() + issnl = l[1] + assert len(issnl) == 9 and issnl[4] == '-' + m[domain] = issnl + print("Got {} matchings.".format(len(m))) + return m + + def want(self, raw_record): + # do it all in parse_record() + return True + + def parse_record(self, row): + """ + TSV rows: + - sha1 b32 key + - JSON string: CDX-ish + - surt + - url + - <etc> + - mime + - size (?) + - JSON string: grobid metadata + """ + + # parse row + row = row.split('\t') + assert len(row) == 5 + sha1 = b32_hex(row[0][5:]) + cdx_dict = json.loads(row[1]) + url = cdx_dict['url'] + domain = url.split('/')[2].lower() + + if not domain: + self.counts['skip-domain-blank'] += 1 + return None + + # domain in scope? 
+ issnl = self._domain_issnl_map.get(domain) + if not issnl: + self.counts['skip-domain-scope'] += 1 + return None + if 'revistas' in domain.lower().split('.'): + self.counts['skip-domain-revistas'] += 1 + return None + + # lookup file + #print(sha1) + try: + file_entity = self.api.lookup_file(sha1=sha1, expand="releases") + except fatcat_openapi_client.rest.ApiException as err: + if err.status == 404: + self.counts['skip-file-not-found'] += 1 + return None + else: + raise err + + # container ident + container_id = self.lookup_issnl(issnl) + if not container_id: + self.counts['skip-container-not-found'] += 1 + return None + + # confirm domain + url_domain_match = False + for furl in file_entity.urls: + fdomain = furl.url.split('/')[2].lower() + if domain == fdomain: + url_domain_match = True + break + if not url_domain_match: + self.counts['skip-no-domain-match'] += 1 + return None + + # fetch releases + releases = [r for r in file_entity.releases if (r.extra.get('longtail_oa') == True and r.container_id == None)] + if not releases: + #print(file_entity.releases) + self.counts['skip-no-releases'] += 1 + return None + + # fetch full release objects (need abstract, etc, for updating) + releases = [self.api.get_release(r.ident) for r in releases] + + # set container_id + for r in releases: + r.container_id = container_id + return releases + + def try_update(self, re_list): + for re in re_list: + self.api.update_release(self.get_editgroup_id(), re.ident, re) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + raise NotImplementedError + +def run_fixup(args): + fmi = LongtailIssnlSingleDomainFixup(args.api, + args.domain_issnl_tsv_file, + edit_batch_size=args.batch_size) + LinePusher(fmi, args.insertable_tsv_file).run() + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--api-host-url', + default="http://localhost:9411/v0", + help="connect to this host/port") + parser.add_argument('--batch-size', + help="size of batch 
to send", + default=50, type=int) + parser.add_argument('domain_issnl_tsv_file', + help="domain/ISSNL mapping TSV file", + type=argparse.FileType('r')) + parser.add_argument('insertable_tsv_file', + help="dumpgrobidmetainsertable TSV file to work over", + default=sys.stdin, type=argparse.FileType('r')) + + auth_var = "FATCAT_AUTH_SANDCRAWLER" + + args = parser.parse_args() + + args.api = authenticated_api( + args.api_host_url, + # token is an optional kwarg (can be empty string, None, etc) + token=os.environ.get(auth_var)) + run_fixup(args) + +if __name__ == '__main__': + main()