| | | |
|---|---|---|
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-18 15:27:41 -0700 |
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-18 15:27:41 -0700 |
| commit | fb53198956843954a981dbbe83b4727b25ae6427 (patch) | |
| tree | ab16fda4e3738b72b54596ecfd37ed52fb601cbf /python | |
| parent | 873a167634167838568ad9303ff29673339a4641 (diff) | |
arabesque importer
Diffstat (limited to 'python')
| | | |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 165 |
1 file changed, 165 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
new file mode 100644
index 00000000..3e951b73
--- /dev/null
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -0,0 +1,165 @@
+
+import sys
+import json
+import base64
+import sqlite3
+import itertools
+import fatcat_client
+from .common import EntityImporter, clean, make_rel_url
+
+
+def b32_hex(s):
+    s = s.strip().split()[0].lower()
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) != 32:
+        return s
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+
+ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'
+
+class ArabesqueMatchImporter(EntityImporter):
+    """
+    Importer for arabesque crawl report .sqlite files, which contain
+    file/identifier matches based on URL/identifier seedlists.
+
+    Uses a SQL query to iterate through the table looking for rows with:
+
+    - GROBID status 200
+    - known SHA-1 (base32 format)
+    - known timestamp
+
+    Looks up release (by identifier) and file (by SHA-1). If no release exists,
+    bail.
+
+    If no file exists, create one from metadata (using both direct and wayback
+    URLs), link to release, and insert.
+
+    If file exists, optionally update it:
+
+    - if no release match, match to the release
+    - if new URL not included, add it (and wayback)
+
+    Config options:
+    - default URL rel
+    - crawl id (for editgroup metadata)
+    - identifier type
+
+    TODO:
+    - a mode to insert bare files even if identifier not known?
+    """
+
+    def __init__(self, api, extid_type, require_grobid=True, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Match web crawl files to releases based on identifier/URL seedlist")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter')
+        if kwargs.get('crawl_id'):
+            eg_extra['crawl_id'] = eg_extra.get('crawl_id', kwargs['crawl_id'])
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+        assert extid_type in ('doi', 'pmcid', 'pmid')
+        self.extid_type = extid_type
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+        self.default_mime = kwargs.get("default_mime", None)
+        self.do_updates = kwargs.get("do_updates", False)
+        self.require_grobid = require_grobid
+        if self.require_grobid:
+            print("Requiring GROBID status == 200")
+        else:
+            print("NOT checking GROBID status column")
+
+    def want(self, row):
+        if self.require_grobid and not row['postproc_status'] == "200":
+            return False
+        if (row['hit'] == True
+                and row['final_sha1']
+                and row['final_timestamp'] and row['final_timestamp'] != "-"
+                and row['final_mimetype']
+                and row['identifier']):
+            return True
+        else:
+            return False
+
+    def parse_record(self, row):
+
+        extid = row['identifier'].strip()
+
+        # check/cleanup DOI
+        if self.extid_type == 'doi':
+            extid = extid.replace('http://doi.org/', '')
+            extid = extid.replace('https://doi.org/', '')
+            if not extid.startswith('10.'):
+                self.counts['skip-bad-doi'] += 1
+                return None
+
+        # lookup extid
+        try:
+            re = self.api.lookup_release(**{self.extid_type: extid})
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+            # bail on 404 (release not in DB)
+            self.counts['skip-extid-not-found'] += 1
+            return None
+
+        url = make_rel_url(row['final_url'], self.default_link_rel)
+        if not url:
+            self.counts['skip-url'] += 1
+            return None
+        wayback = "https://web.archive.org/web/{}/{}".format(
+            row['final_timestamp'],
+            row['final_url'])
+        urls = [url, ("webarchive", wayback)]
+
+        urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls]
+
+        fe = fatcat_client.FileEntity(
+            sha1=b32_hex(row['final_sha1']),
+            mimetype=row['final_mimetype'],
+            release_ids=[re.ident],
+            urls=urls,
+        )
+        return fe
+
+    def try_update(self, fe):
+        # lookup sha1, or create new entity
+        existing = None
+        try:
+            existing = self.api.lookup_file(sha1=fe.sha1)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if not existing:
+            return True
+
+        if (fe.release_ids[0] in existing.release_ids) and existing.urls:
+            # TODO: could still, in theory, update with the new URL?
+            self.counts['exists'] += 1
+            return False
+
+        if not self.do_updates:
+            self.counts['update-disabled'] += 1
+            return False
+
+        # merge the existing entity into this one and update
+        existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
+        existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
+        existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+        existing.mimetype = existing.mimetype or fe.mimetype
+        self.api.update_file(existing.ident, existing, editgroup_id=self.get_editgroup_id())
+        self.counts['update'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        self.api.create_file_batch(batch,
+            autoaccept=True,
+            description=self.editgroup_description,
+            extra=json.dumps(self.editgroup_extra))
+
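
Usage note: this commit adds only the importer class; the command-line glue that opens the .sqlite report and feeds rows into it is not part of this diff. The sketch below shows one plausible way to drive it, assuming the shared EntityImporter base class provides push_record() and finish(); the table name (crawl_result), API endpoint, and crawl id are assumptions for illustration only.

```python
import sqlite3

import fatcat_client
from fatcat_tools.importers.arabesque import (
    ArabesqueMatchImporter,
    ARABESQUE_MATCH_WHERE_CLAUSE,
)

# Generated swagger client setup; the endpoint URL is an assumption for this sketch.
conf = fatcat_client.Configuration()
conf.host = "https://api.fatcat.wiki/v0"
api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))

importer = ArabesqueMatchImporter(
    api,
    extid_type='doi',
    require_grobid=True,
    crawl_id='EXAMPLE-CRAWL-2019',    # hypothetical crawl identifier for editgroup metadata
    default_link_rel='web',
)

# The importer accesses columns by name (row['final_sha1'], etc.), so use
# sqlite3.Row for dict-style access. The table name 'crawl_result' is an
# assumption, not established by this commit.
db = sqlite3.connect("arabesque_output.sqlite")
db.row_factory = sqlite3.Row
for row in db.execute("SELECT * FROM crawl_result " + ARABESQUE_MATCH_WHERE_CLAUSE):
    importer.push_record(row)
importer.finish()    # assumed to flush any partial batch via insert_batch()
```

Rows that fail want() are counted and skipped; the rest pass through parse_record() and try_update(), and new FileEntity objects are batch-inserted with autoaccept.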

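The b32_hex() helper bridges hash formats: arabesque reports carry SHA-1 digests as 32-character base32 strings (sometimes prefixed with "sha1:"), while fatcat file entities store 40-character lowercase hex; any input that is not 32 characters long is passed through unchanged. A small round-trip check of that behavior, assuming the module is importable as fatcat_tools.importers.arabesque:

```python
import base64
import hashlib

from fatcat_tools.importers.arabesque import b32_hex

# Build one SHA-1 digest in both encodings.
hex_digest = hashlib.sha1(b"example").hexdigest()                  # 40-char lowercase hex
b32_digest = base64.b32encode(bytes.fromhex(hex_digest)).decode()  # 32-char base32

assert b32_hex("sha1:" + b32_digest) == hex_digest  # prefixed base32 -> hex
assert b32_hex(b32_digest.lower()) == hex_digest    # bare base32 -> hex
assert b32_hex(hex_digest) == hex_digest            # already hex (40 chars): returned as-is
```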