author:    Bryan Newbold <bnewbold@robocracy.org>  2021-11-29 15:02:27 -0800
committer: Bryan Newbold <bnewbold@robocracy.org>  2021-11-29 15:02:27 -0800
commit:    7c6afa0a21883dc8037f3d021246db24eef39b41 (patch)
tree:      3fa7c1e595248a46e88ea62c2f9f70106186b0fe /extra/fixups
parent:    c32154f2875a7fb9aac727013e1475cdd811e180 (diff)
download:  fatcat-7c6afa0a21883dc8037f3d021246db24eef39b41.tar.gz, fatcat-7c6afa0a21883dc8037f3d021246db24eef39b41.zip
clean up extra/ folder a bit
Diffstat (limited to 'extra/fixups')
-rwxr-xr-x  extra/fixups/fixup_longtail_issnl_unique.py  232
1 file changed, 0 insertions(+), 232 deletions(-)
diff --git a/extra/fixups/fixup_longtail_issnl_unique.py b/extra/fixups/fixup_longtail_issnl_unique.py
deleted file mode 100755
index ea615a13..00000000
--- a/extra/fixups/fixup_longtail_issnl_unique.py
+++ /dev/null
@@ -1,232 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This file must be moved to the fatcat:python/ directory (aka, not in
-fatcat:extra/fixups) to run. It's a "one-off", so probably will bitrot pretty
-quickly. There are no tests.
-
-Example invocation:
-
-    zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | ./fixup_longtail_issnl_unique.py /srv/fatcat/datasets/single_domain_issnl.tsv
-
-
-See also:
-- bnewbold/scratch:mellon/201904_longtail_issn.md
-- aitio:/rapida/OA-JOURNAL-TESTCRAWL-TWO-2018
-- https://archive.org/details/OA-JOURNAL-TESTCRAWL-TWO-2018-extra
-- https://archive.org/download/ia_longtail_dumpgrobidmetainsertable_2018-09-23/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz
-
-QA notes:
-
-- everything on revistas.uv.mx linked to 2395-9495, which is only one journal
-  on that domain. blacklist 'revistas' in the domain?
-- afjg3yjdjbf2dad47t5jq7nlbe => 2305-7254 ok match but not perfect (wrong year
-  of conference). probably better than nothing.
-- elib.mi.sanu.ac.rs has several journals on domain. container_name was correct.
-- revistavirtual.ucn.edu.co has 2x journals
-- lpchkxkp5jecdgrab33fxodd7y bad match
-- k36web33jvf25by64gop4yil7q an IR, not a journal (ok)
-- hvxercwasjhotpewb5xfadyyle good match, though only an abstract (in URL). full
-  articles get DOIs
-- release_epkiok6y3zhsnp3no2lkljznza not a paper; journal match batch (cato, wtf)
-- release_b3jolh25mbg4djrqotgosyeike jfr.unibo.it good
-- release_bzr35evb4bdd3mxex6gxn6dcyy conf.ostis.net good?
-- uzspace.uzulu.ac.za IR, not a container
-- release_5lt36yy3vre2nnig46toy67kdi wrong, multiple journals
-- release_54hmv5gvtjghjk7rpcbp2pn2ky good
-- release_6h7doxfaxnao3jm7f6jkfdpdwm good
-- release_6pio5hz6bvawfnodhkvmfk4jei correct but stub
-- release_7oobqygqczapbgdvvgbxfyvqli correct
-- release_tsljmbevpzfpxiezzv7puwbilq good
-
-general notes:
-- GROBID works pretty well. references look pretty good, should match. there is
-  a non-trivial fraction of non-journal content, but it isn't too bad
-- this "single-journal domain" premise doesn't work
-- could probably do a subset based on "is the journal name in the domain name",
-  or "is domain acronym of journal name"
-- surprising number of IRs with ISSNs in here
-- might have better luck blacklisting out latin american TLDs, which tend to
-  host many journals?
-"""
-
-import os, sys, argparse
-import json
-import sqlite3
-import itertools
-
-import fatcat_openapi_client
-from fatcat_tools import authenticated_api
-from fatcat_tools.importers.common import EntityImporter, clean, LinePusher
-from fatcat_tools.importers.arabesque import b32_hex
-
-
-class LongtailIssnlSingleDomainFixup(EntityImporter):
-    """
-    Fixup script for bootstrap longtail OA release entities which don't have a
-    container but are confidently associated with an ISSN-L based on file
-    domain.
-
-    Expected to be a one-time fixup impacting about 600k entities (around half
-    the longtail OA batch).
-
-    Reads in a mapping of unique domain-ISSNL pairs, then iterates over the
-    original matched import batch file. For each line in the latter:
-
-    - checks if in-scope based on the domain-ISSNL map
-    - uses the API to look up the file (by SHA-1) and confirm the domain in its URL list
-    - looks up releases for the file and retains the longtail-oa ones (an extra flag)
-    - if a release is longtail-oa and has no container, sets the container based
-      on ISSN-L (using a cached lookup)
-    - uses EntityImporter machinery to manage the update/editgroup queue
-    """
-
-    def __init__(self, api, domain_issnl_tsv_file, **kwargs):
-
-        eg_desc = kwargs.pop('editgroup_description',
-            "Fixup for longtail OA releases that can be matched to specific container by file domain / ISSN-L mapping")
-        eg_extra = kwargs.pop('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.LongtailIssnlSingleDomainFixup')
-        super().__init__(api,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
-
-        self._domain_issnl_map = self.load_domain_issnl(domain_issnl_tsv_file)
-        self._issnl_container_map = dict()
-
-    def load_domain_issnl(self, tsv_file):
-        print("Loading domain ISSN-L file...")
-        m = dict()
-        for l in tsv_file:
-            l = l.strip().split('\t')
-            assert len(l) == 2
-            domain = l[0].lower()
-            issnl = l[1]
-            assert len(issnl) == 9 and issnl[4] == '-'
-            m[domain] = issnl
-        print("Got {} mappings.".format(len(m)))
-        return m
-
-    def want(self, raw_record):
-        # do it all in parse_record()
-        return True
-
-    def parse_record(self, row):
-        """
-        TSV rows:
-        - sha1 b32 key
-        - JSON string: CDX-ish
-        - surt
-        - url
-        - <etc>
-        - mime
-        - size (?)
-        - JSON string: grobid metadata
-        """
-
-        # parse row
-        row = row.split('\t')
-        assert len(row) == 5
-        sha1 = b32_hex(row[0][5:])
-        cdx_dict = json.loads(row[1])
-        url = cdx_dict['url']
-        domain = url.split('/')[2].lower()
-
-        if not domain:
-            self.counts['skip-domain-blank'] += 1
-            return None
-
-        # domain in scope?
-        issnl = self._domain_issnl_map.get(domain)
-        if not issnl:
-            self.counts['skip-domain-scope'] += 1
-            return None
-        if 'revistas' in domain.lower().split('.'):
-            self.counts['skip-domain-revistas'] += 1
-            return None
-
-        # lookup file
-        #print(sha1)
-        try:
-            file_entity = self.api.lookup_file(sha1=sha1, expand="releases")
-        except fatcat_openapi_client.rest.ApiException as err:
-            if err.status == 404:
-                self.counts['skip-file-not-found'] += 1
-                return None
-            else:
-                raise err
-
-        # container ident
-        container_id = self.lookup_issnl(issnl)
-        if not container_id:
-            self.counts['skip-container-not-found'] += 1
-            return None
-
-        # confirm domain
-        url_domain_match = False
-        for furl in file_entity.urls:
-            fdomain = furl.url.split('/')[2].lower()
-            if domain == fdomain:
-                url_domain_match = True
-                break
-        if not url_domain_match:
-            self.counts['skip-no-domain-match'] += 1
-            return None
-
-        # fetch releases
-        releases = [r for r in file_entity.releases if (r.extra.get('longtail_oa') == True and r.container_id == None)]
-        if not releases:
-            #print(file_entity.releases)
-            self.counts['skip-no-releases'] += 1
-            return None
-
-        # fetch full release objects (need abstract, etc, for updating)
-        releases = [self.api.get_release(r.ident) for r in releases]
-
-        # set container_id
-        for r in releases:
-            r.container_id = container_id
-        return releases
-
-    def try_update(self, re_list):
-        for re in re_list:
-            self.api.update_release(self.get_editgroup_id(), re.ident, re)
-            self.counts['update'] += 1
-        return False
-
-    def insert_batch(self, batch):
-        raise NotImplementedError
-
-def run_fixup(args):
-    fmi = LongtailIssnlSingleDomainFixup(args.api,
-        args.domain_issnl_tsv_file,
-        edit_batch_size=args.batch_size)
-    LinePusher(fmi, args.insertable_tsv_file).run()
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--api-host-url',
-        default="http://localhost:9411/v0",
-        help="connect to this host/port")
-    parser.add_argument('--batch-size',
-        help="size of batch to send",
-        default=50, type=int)
-    parser.add_argument('domain_issnl_tsv_file',
-        help="domain/ISSNL mapping TSV file",
-        type=argparse.FileType('r'))
-    parser.add_argument('insertable_tsv_file',
-        help="dumpgrobidmetainsertable TSV file to work over",
-        default=sys.stdin, type=argparse.FileType('r'))
-
-    auth_var = "FATCAT_AUTH_SANDCRAWLER"
-
-    args = parser.parse_args()
-
-    args.api = authenticated_api(
-        args.api_host_url,
-        # token is an optional kwarg (can be empty string, None, etc)
-        token=os.environ.get(auth_var))
-    run_fixup(args)
-
-if __name__ == '__main__':
-    main()
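
A few implementation notes on the deleted script, for anyone resurrecting it.

The TSV key column carries a base32-encoded SHA-1 (with a "sha1:" prefix, which the script strips via row[0][5:]); b32_hex, imported from fatcat_tools.importers.arabesque, converts it to the hex form that lookup_file() expects. A stdlib-only sketch of that conversion; the real helper may differ in edge-case handling:

    import base64

    def b32_hex_sketch(s):
        # Strip the "sha1:" prefix if present (the original script does
        # this itself before calling the helper).
        if s.startswith("sha1:"):
            s = s[5:]
        # A SHA-1 digest is 20 bytes: 32 base32 characters, 40 hex characters.
        if len(s) != 32:
            raise ValueError("expected 32-char base32 SHA-1: %r" % s)
        # base32 decode, then hex (base16) encode, lowercased
        return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")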
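Domain extraction happens twice (on the CDX URL, and on each file URL when confirming the match) via url.split('/')[2].lower(), which raises IndexError on relative or malformed URLs and keeps any ':port' suffix. A more defensive equivalent using urlparse; url_domain is a hypothetical helper, not part of the original script:

    from urllib.parse import urlparse

    def url_domain(url):
        # urlparse().hostname is None for relative/malformed URLs, and
        # already strips any ':port' suffix and lowercases the host.
        host = urlparse(url).hostname
        return host or ""

    # matches the split('/')[2].lower() behavior for ordinary absolute URLs:
    assert url_domain("http://jfr.unibo.it/article/view/123") == "jfr.unibo.it"
    assert url_domain("not-a-url") == ""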
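The container ident comes from self.lookup_issnl(issnl), which EntityImporter provides with memoization (the script pre-creates _issnl_container_map for that cache). A sketch of the equivalent logic, assuming the fatcat_openapi_client method lookup_container(issnl=...); this is not the literal EntityImporter code:

    import fatcat_openapi_client

    def lookup_issnl_cached(api, cache, issnl):
        # Memoize both hits and misses, so repeated rows for the same
        # domain cost only one API round-trip.
        if issnl in cache:
            return cache[issnl]
        try:
            ident = api.lookup_container(issnl=issnl).ident
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise
            ident = None
        cache[issnl] = ident
        return ident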
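Finally, since most rows fail the domain gate, the dump could be pre-filtered offline before generating any API traffic. A sketch that reuses the same two-column domain/ISSN-L TSV and reproduces the script's skip conditions; the filenames are hypothetical:

    import json
    import sys

    # domain<TAB>issnl per line, same shape load_domain_issnl() validates
    domain_issnl = {}
    with open("single_domain_issnl.tsv") as f:
        for line in f:
            domain, issnl = line.rstrip("\n").split("\t")
            assert len(issnl) == 9 and issnl[4] == "-"
            domain_issnl[domain.lower()] = issnl

    for row in sys.stdin:
        cols = row.split("\t")
        cdx = json.loads(cols[1])
        try:
            domain = cdx["url"].split("/")[2].lower()
        except IndexError:
            continue
        # same gates as parse_record(): in the map, and not a multi-journal
        # 'revistas' portal
        if domain in domain_issnl and "revistas" not in domain.split("."):
            sys.stdout.write(row)

Usage would mirror the original invocation, e.g. zcat dump.tsv.gz | ./prefilter.py > in_scope.tsv.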