From fb53198956843954a981dbbe83b4727b25ae6427 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 18 Apr 2019 15:27:41 -0700 Subject: arabesque importer --- python/fatcat_tools/importers/arabesque.py | 165 +++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 python/fatcat_tools/importers/arabesque.py (limited to 'python') diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py new file mode 100644 index 00000000..3e951b73 --- /dev/null +++ b/python/fatcat_tools/importers/arabesque.py @@ -0,0 +1,165 @@ + +import sys +import json +import base64 +import sqlite3 +import itertools +import fatcat_client +from .common import EntityImporter, clean, make_rel_url + + +def b32_hex(s): + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + + +ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' + +class ArabesqueMatchImporter(EntityImporter): + """ + Importer for arabesque crawl report .sqlite files, which contain + file/identifier matches based on URL/identifier seedlists. + + Uses a SQL query to iterate through table looking for rows with: + + - GROBID status 200 + - known SHA-1 (base32 format) + - known timestamp + + Looks up release (by identifier) and file (by SHA-1). If no release exists, + bail. + + If no file exists, create one from metadata (using both direct and wayback + URLs), link to release, and insert. + + If file exists, optionally update it: + + - if no release match, match to the release + - if new URL not included, add it (and wayback) + + Config options: + - default URL rel + - crawl id (for editgroup metadata) + - identifier type + + TODO: + - a mode to insert bare files even if identifier not known? + """ + + def __init__(self, api, extid_type, require_grobid=True, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Match web crawl files to releases based on identifier/URL seedlist") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') + if kwargs.get('crawl_id'): + eg_extra['crawl_id'] = eg_extra.get('crawl_id') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + assert extid_type in ('doi', 'pmcid', 'pmid') + self.extid_type = extid_type + self.default_link_rel = kwargs.get("default_link_rel", "web") + self.default_mime = kwargs.get("default_mime", None) + self.do_updates = kwargs.get("do_updates", False) + self.require_grobid = require_grobid + if self.require_grobid: + print("Requiring GROBID status == 200") + else: + print("NOT checking GROBID status column") + + def want(self, row): + if self.require_grobid and not row['postproc_status'] == "200": + return False + if (row['hit'] == True + and row['final_sha1'] + and row['final_timestamp'] and row['final_timestamp'] != "-" + and row['final_mimetype'] + and row['hit'] == True + and row['identifier']): + return True + else: + return False + + def parse_record(self, row): + + extid = row['identifier'].strip() + + # check/cleanup DOI + if self.extid_type == 'doi': + self.extid_type.replace('http://doi.org/', '') + self.extid_type.replace('https://doi.org/', '') + if not extid.startswith('10.'): + self.counts['skip-bad-doi'] + return None + + # lookup extid + try: + re = self.api.lookup_release(**{self.extid_type: extid}) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # bail on 404 (release not in DB) + self.counts['skip-extid-not-found'] += 1 + return None + + url = make_rel_url(row['final_url'], self.default_link_rel) + if not url: + self.counts['skip-url'] += 1 + return None + wayback = "https://web.archive.org/web/{}/{}".format( + row['final_timestamp'], + row['final_url']) + urls = [url, ("webarchive", wayback)] + + urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls] + + fe = fatcat_client.FileEntity( + sha1=b32_hex(row['final_sha1']), + mimetype=row['final_mimetype'], + release_ids=[re.ident], + urls=urls, + ) + return fe + + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + if (fe.release_ids[0] in existing.release_ids) and existing.urls: + # TODO: could still, in theory update with the new URL? + self.counts['exists'] += 1 + return False + + if not self.do_updates: + self.counts['update-disabled'] += 1 + return False + + # merge the existing into this one and update + existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) + existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] + existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + existing.mimetype = existing.mimetype or fe.mimetype + self.api.update_file(existing.ident, existing, editgroup_id=self.get_editgroup_id()) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + -- cgit v1.2.3