Diffstat (limited to 'extra/fixups')
-rwxr-xr-x  extra/fixups/fixup_longtail_issnl_unique.py | 232
1 file changed, 0 insertions(+), 232 deletions(-)
diff --git a/extra/fixups/fixup_longtail_issnl_unique.py b/extra/fixups/fixup_longtail_issnl_unique.py
deleted file mode 100755
index ea615a13..00000000
--- a/extra/fixups/fixup_longtail_issnl_unique.py
+++ /dev/null
@@ -1,232 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This file must be moved to the fatcat:python/ directory (aka, not in
-fatcat:extra/fixups) to run. It's a "one-off", so it will probably bitrot
-quickly. There are no tests.
-
-Example invocation:
-
-    zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | ./fixup_longtail_issnl_unique.py /srv/fatcat/datasets/single_domain_issnl.tsv
-
-See also:
-- bnewbold/scratch:mellon/201904_longtail_issn.md
-- aitio:/rapida/OA-JOURNAL-TESTCRAWL-TWO-2018
-- https://archive.org/details/OA-JOURNAL-TESTCRAWL-TWO-2018-extra
-- https://archive.org/download/ia_longtail_dumpgrobidmetainsertable_2018-09-23/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz
-
-QA notes:
-
-- everything on revistas.uv.mx linked to 2395-9495, which is only one journal
-  on that domain. blacklist 'revistas' in the domain?
-- afjg3yjdjbf2dad47t5jq7nlbe => 2305-7254 ok match, but not perfect (wrong year
-  of conference). probably better than nothing.
-- elib.mi.sanu.ac.rs has several journals on the domain. container_name was correct.
-- revistavirtual.ucn.edu.co has 2x journals
-- lpchkxkp5jecdgrab33fxodd7y bad match
-- k36web33jvf25by64gop4yil7q an IR, not a journal (ok)
-- hvxercwasjhotpewb5xfadyyle good match, though only an abstract (in URL). full
-  articles get DOIs
-- release_epkiok6y3zhsnp3no2lkljznza not a paper; journal match batch (cato, wtf)
-- release_b3jolh25mbg4djrqotgosyeike jfr.unibo.it good
-- release_bzr35evb4bdd3mxex6gxn6dcyy conf.ostis.net good?
-- uzspace.uzulu.ac.za IR, not a container
-- release_5lt36yy3vre2nnig46toy67kdi wrong, multiple journals
-- release_54hmv5gvtjghjk7rpcbp2pn2ky good
-- release_6h7doxfaxnao3jm7f6jkfdpdwm good
-- release_6pio5hz6bvawfnodhkvmfk4jei correct but stub
-- release_7oobqygqczapbgdvvgbxfyvqli correct
-- release_tsljmbevpzfpxiezzv7puwbilq good
-
-general notes:
-- GROBID works pretty well. references look pretty good, should match. there is
-  a non-trivial fraction of non-journal content, but it isn't too bad
-- this "single-journal domain" premise doesn't work
-- could probably do a subset based on "is the journal name in the domain name",
-  or "is the domain an acronym of the journal name"
-- surprising number of IRs with ISSNs in here
-- might have better luck blacklisting latin american TLDs, which tend to
-  host many journals?
-"""
-
-import os, sys, argparse
-import json
-import sqlite3
-import itertools
-
-import fatcat_openapi_client
-from fatcat_tools import authenticated_api
-from fatcat_tools.importers.common import EntityImporter, clean, LinePusher
-from fatcat_tools.importers.arabesque import b32_hex
-
-
-class LongtailIssnlSingleDomainFixup(EntityImporter):
-    """
-    Fixup script for bootstrap longtail OA release entities which don't have a
-    container but are confidently associated with an ISSN-L based on file
-    domain.
-
-    Expected to be a one-time fixup impacting about 600k entities (around half
-    the longtail OA batch).
-
-    Reads in the unique domain-to-ISSN-L mappings, then iterates over the
-    original matched import batch file.
-    For each line in the latter:
-
-    - checks if in-scope based on the domain-ISSNL map
-    - uses the API to look up the file (by SHA-1) and confirm the domain in the URL list
-    - looks up releases for the file and retains the longtail-oa ones (an extra flag)
-    - if a release is longtail-oa and has no container, sets the container based
-      on the ISSN-L (using a cached lookup)
-    - uses EntityImporter machinery to manage the update/editgroup queue
-    """
-
-    def __init__(self, api, domain_issnl_tsv_file, **kwargs):
-
-        eg_desc = kwargs.pop('editgroup_description',
-            "Fixup for longtail OA releases that can be matched to a specific container by file domain / ISSN-L mapping")
-        eg_extra = kwargs.pop('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.LongtailIssnlSingleDomainFixup')
-        super().__init__(api,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
-
-        self._domain_issnl_map = self.load_domain_issnl(domain_issnl_tsv_file)
-        self._issnl_container_map = dict()
-
-    def load_domain_issnl(self, tsv_file):
-        print("Loading domain ISSN-L file...")
-        m = dict()
-        for l in tsv_file:
-            l = l.strip().split('\t')
-            assert len(l) == 2
-            domain = l[0].lower()
-            issnl = l[1]
-            assert len(issnl) == 9 and issnl[4] == '-'
-            m[domain] = issnl
-        print("Got {} mappings.".format(len(m)))
-        return m
-
-    def want(self, raw_record):
-        # do it all in parse_record()
-        return True
-
-    def parse_record(self, row):
-        """
-        TSV rows:
-
-        - sha1 b32 key
-        - JSON string: CDX-ish
-          - surt
-          - url
-          - <etc>
-        - mime
-        - size (?)
-        - JSON string: grobid metadata
-        """
-
-        # parse row
-        row = row.split('\t')
-        assert len(row) == 5
-        sha1 = b32_hex(row[0][5:])
-        cdx_dict = json.loads(row[1])
-        url = cdx_dict['url']
-        domain = url.split('/')[2].lower()
-
-        if not domain:
-            self.counts['skip-domain-blank'] += 1
-            return None
-
-        # domain in scope?
-        issnl = self._domain_issnl_map.get(domain)
-        if not issnl:
-            self.counts['skip-domain-scope'] += 1
-            return None
-        if 'revistas' in domain.lower().split('.'):
-            self.counts['skip-domain-revistas'] += 1
-            return None
-
-        # lookup file
-        #print(sha1)
-        try:
-            file_entity = self.api.lookup_file(sha1=sha1, expand="releases")
-        except fatcat_openapi_client.rest.ApiException as err:
-            if err.status == 404:
-                self.counts['skip-file-not-found'] += 1
-                return None
-            else:
-                raise err
-
-        # container ident
-        container_id = self.lookup_issnl(issnl)
-        if not container_id:
-            self.counts['skip-container-not-found'] += 1
-            return None
-
-        # confirm domain
-        url_domain_match = False
-        for furl in file_entity.urls:
-            fdomain = furl.url.split('/')[2].lower()
-            if domain == fdomain:
-                url_domain_match = True
-                break
-        if not url_domain_match:
-            self.counts['skip-no-domain-match'] += 1
-            return None
-
-        # fetch releases
-        releases = [r for r in file_entity.releases if (r.extra.get('longtail_oa') == True and r.container_id == None)]
-        if not releases:
-            #print(file_entity.releases)
-            self.counts['skip-no-releases'] += 1
-            return None
-
-        # fetch full release objects (need abstract, etc, for updating)
-        releases = [self.api.get_release(r.ident) for r in releases]
-
-        # set container_id
-        for r in releases:
-            r.container_id = container_id
-        return releases
-
-    def try_update(self, re_list):
-        for re in re_list:
-            self.api.update_release(self.get_editgroup_id(), re.ident, re)
-            self.counts['update'] += 1
-        return False
-
-    def insert_batch(self, batch):
-        raise NotImplementedError
-
-def run_fixup(args):
-    fmi = LongtailIssnlSingleDomainFixup(args.api,
-        args.domain_issnl_tsv_file,
-        edit_batch_size=args.batch_size)
-    LinePusher(fmi, args.insertable_tsv_file).run()
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--api-host-url',
-        default="http://localhost:9411/v0",
-        help="connect to this host/port")
-    parser.add_argument('--batch-size',
-        help="size of batch to send",
-        default=50, type=int)
-    parser.add_argument('domain_issnl_tsv_file',
-        help="domain/ISSNL mapping TSV file",
-        type=argparse.FileType('r'))
-    parser.add_argument('insertable_tsv_file',
-        help="dumpgrobidmetainsertable TSV file to work over",
-        default=sys.stdin, type=argparse.FileType('r'))
-
-    auth_var = "FATCAT_AUTH_SANDCRAWLER"
-
-    args = parser.parse_args()
-
-    args.api = authenticated_api(
-        args.api_host_url,
-        # token is an optional kwarg (can be empty string, None, etc)
-        token=os.environ.get(auth_var))
-    run_fixup(args)
-
-if __name__ == '__main__':
-    main()
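Note: the domain-scoping step in parse_record() is self-contained: load the two-column domain/ISSN-L TSV, derive a domain from the URL, skip blank and 'revistas' domains, and look the domain up in the map. A minimal standalone sketch of that step, with illustrative names, using urllib.parse where the script splits the URL by hand:

    import sys
    from urllib.parse import urlparse

    def load_domain_issnl(tsv_file):
        # two tab-separated columns per line: domain, ISSN-L (like "1234-5678")
        mapping = dict()
        for line in tsv_file:
            domain, issnl = line.strip().split('\t')
            assert len(issnl) == 9 and issnl[4] == '-'
            mapping[domain.lower()] = issnl
        return mapping

    def issnl_for_url(url, mapping):
        # same semantics as url.split('/')[2] for http(s) URLs
        domain = urlparse(url).netloc.lower()
        if not domain or 'revistas' in domain.split('.'):
            # per the QA notes above, 'revistas' domains tend to host many journals
            return None
        return mapping.get(domain)

    if __name__ == '__main__':
        with open(sys.argv[1]) as f:
            mapping = load_domain_issnl(f)
        # hypothetical URL; prints the mapped ISSN-L or None
        print(issnl_for_url('http://example.edu/article/123', mapping))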
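Note: parse_record() converts the base32-encoded SHA-1 from the row key (row[0][5:] strips a five-character prefix such as 'sha1:') to hex via b32_hex from fatcat_tools.importers.arabesque before the file lookup. A stdlib-only sketch of that conversion, under a hypothetical helper name:

    import base64

    def b32_to_hex(b32_digest):
        # a SHA-1 digest is 20 bytes: 32 base32 characters, 40 hex characters
        raw = base64.b32decode(b32_digest.upper())
        return base64.b16encode(raw).decode('ascii').lower()

    # sanity check: 20 zero bytes encode as 32 'A's in base32
    assert b32_to_hex('A' * 32) == '0' * 40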
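Note: the "general notes" in the docstring propose narrowing the fixup to cases where the journal name is contained in, or abbreviated by, the domain. A rough illustration of that heuristic, with a hypothetical helper; the stopword list and matching rules would need the same kind of QA as above:

    import re

    STOPWORDS = {'of', 'the', 'and', 'for', 'de', 'la'}

    def name_matches_domain(journal_name, domain):
        # compare the left-most domain label against the journal name,
        # both squashed ("journalfieldrobotics") and as an acronym ("jfr")
        label = domain.split('.')[0]
        tokens = [t for t in re.split(r'\W+', journal_name.lower())
                  if t and t not in STOPWORDS]
        squashed = ''.join(tokens)
        acronym = ''.join(t[0] for t in tokens)
        return label == acronym or label in squashed or squashed in label

    # name_matches_domain('Journal of Field Robotics', 'jfr.example.edu') -> True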