From a4db9ee2e18a18b23eb7ece484f95914421f877d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 13 Nov 2019 00:27:48 -0800 Subject: ingest file result importer --- python/fatcat_tools/importers/__init__.py | 3 +- python/fatcat_tools/importers/ingest.py | 134 ++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 python/fatcat_tools/importers/ingest.py (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 5e2948f4..025a111c 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -26,5 +26,4 @@ from .orcid import OrcidImporter from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat -#from .kafka_source import KafkaSource -#from .file_source import FileSource +from .ingest import IngestFileResultImporter diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py new file mode 100644 index 00000000..9e75c26f --- /dev/null +++ b/python/fatcat_tools/importers/ingest.py @@ -0,0 +1,134 @@ + +import sys +import json +import base64 +import itertools +import fatcat_openapi_client +from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex + + +class IngestFileResultImporter(EntityImporter): + + def __init__(self, api, require_grobid=True, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Files crawled from web using sandcrawler ingest tool") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + self.default_link_rel = kwargs.get("default_link_rel", "web") + assert self.default_link_rel + self.default_mimetype = kwargs.get("default_mimetype", None) + self.do_updates = kwargs.get("do_updates", False) + self.require_grobid = require_grobid + if self.require_grobid: + print("Requiring GROBID status == 200") + else: + print("NOT checking GROBID success") + + def want(self, row): + if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200: + return False + if row.get('hit') == True and row.get('file_meta'): + return True + else: + return False + + def parse_record(self, row): + + request = row['request'] + fatcat = request.get('fatcat') + file_meta = row['file_meta'] + + # identify release by fatcat ident or extid lookup + release_ident = None + if fatcat and fatcat.get('release_ident'): + release_ident = fatcat.get('release_ident') + elif request.get('ext_ids'): + # if no fatcat ident, try extids + for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv'): + extid = request['ext_ids'].get(extid_type) + if not extid: + continue + try: + release = self.api.lookup_release(**{extid_type: extid}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status == 404: + continue + elif err.status == 400: + self.counts['warn-extid-invalid'] += 1 + continue + release_ident = release.ident + break + + if not release: + self.counts['skip-release-not-found'] += 1 + + cdx = row.get('cdx') + if not cdx: + return None + + url = make_rel_url(cdx['url'], self.default_link_rel) + + if not url: + self.counts['skip-url'] += 1 + return None + wayback = "https://web.archive.org/web/{}/{}".format( + cdx['datetime'], + cdx['url']) + urls = [url, ("webarchive", wayback)] + + urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] + + fe = fatcat_openapi_client.FileEntity( + md5=file_meta['md5hex'], + sha1=file_meta['sha1hex'], + sha256=file_meta['sha256hex'], + size=file_meta['size_bytes'], + mimetype=file_meta['mimetype'] or self.default_mimetype, + release_ids=[release_ident], + urls=urls, + ) + if fatcat and fatcat.get('edit_extra'): + fe.edit_extra = fatcat['edit_extra'] + if request.get('project'): + if not fe.edit_extra: + fe.edit_extra = dict() + fe.edit_extra['project'] = request['project'] + return fe + + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + if (fe.release_ids[0] in existing.release_ids) and existing.urls: + # TODO: could still, in theory update with the new URL? + self.counts['exists'] += 1 + return False + + if not self.do_updates: + self.counts['skip-update-disabled'] += 1 + return False + + # TODO: for now, never update + self.counts['skip-update-disabled'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + -- cgit v1.2.3