import sys
import json
import sqlite3
import itertools

import fatcat_openapi_client

from fatcat_tools.normal import *

from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS


class ShadowLibraryImporter(EntityImporter):
    """
    Importer for shadow library files (matched to releases)

    Input format is JSON with keys:
    - shadow
        - shadow_corpus (string slug)
        - shadow_id (string)
        - doi
        - pmid
        - isbn13
    - file_meta
        - sha1hex
        - sha256hex
        - md5hex
        - size_bytes
        - mimetype
    - cdx (may be null)
        - url
        - datetime
    """

    def __init__(self, api, **kwargs):
        eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
        eg_extra = kwargs.pop('editgroup_extra', dict())
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)
        self.default_link_rel = kwargs.get("default_link_rel", "web")

    def want(self, raw_record):
        """
        Only want to import records with complete file-level metadata
        """
        fm = raw_record['file_meta']
        if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
            self.counts['skip-file-meta-incomplete'] += 1
            return False
        if fm['mimetype'] != 'application/pdf':
            self.counts['skip-not-pdf'] += 1
            return False
        return True

    def parse_record(self, obj):
        """
        We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
        """

        shadow_corpus = obj['shadow']['shadow_corpus']
        assert shadow_corpus == shadow_corpus.strip().lower()
        doi = clean_doi(obj['shadow'].get('doi'))
        pmid = clean_pmid(obj['shadow'].get('pmid'))
        isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
        shadow_id = obj['shadow'].get('shadow_id').strip()
        assert shadow_id

        extra = {'{}_id'.format(shadow_corpus): shadow_id}
        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
            if not ext_id:
                continue
            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id

        # lookup release via several idents
        re = None
        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
            if not ext_id:
                continue
            try:
                re = self.api.lookup_release(**{ext_type: ext_id})
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status not in (404, 400):
                    raise err
                re = None
            if re:
                break

        if not re:
            self.counts['skip-release-not-found'] += 1
            return None

        release_ids = [re.ident]

        # parse single CDX into URLs (if exists)
        urls = []
        if obj.get('cdx'):
            url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
            if url is not None:
                urls.append(url)
            wayback = "https://web.archive.org/web/{}/{}".format(
                obj['cdx']['datetime'],
                obj['cdx']['url'])
            urls.append(("webarchive", wayback))
        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]

        fe = fatcat_openapi_client.FileEntity(
            md5=obj['file_meta']['md5hex'],
            sha1=obj['file_meta']['sha1hex'],
            sha256=obj['file_meta']['sha256hex'],
            size=int(obj['file_meta']['size_bytes']),
            mimetype=obj['file_meta']['mimetype'] or None,
            release_ids=release_ids,
            urls=urls,
            extra=dict(shadows=extra),
        )
        return fe

    def try_update(self, fe):
        # lookup sha1, or create new entity
        existing = None
        try:
            existing = self.api.lookup_file(sha1=fe.sha1)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        if not existing:
            return True

        if not existing.extra:
            existing.extra = {}

        if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
            # already imported from this shadow library; skip
            self.counts['exists'] += 1
            return False
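
        # Within a single editgroup, more than one input record can match the
        # same existing file entity. self._edits_inflight tracks edits already
        # staged in this editgroup (each staged edit gets a .sha1 attribute
        # attached near the end of this method), and the checks below use it
        # to de-dupe at the editgroup level.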
        # check for edit conflicts
        if existing.ident in [e.ident for e in self._edits_inflight]:
            self.counts['skip-update-inflight'] += 1
            return False
        if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
            raise Exception("Inflight insert; shouldn't happen")

        # minimum viable "existing" URL cleanup to fix dupes and broken links:
        # remove 'None' wayback URLs, and set archive.org rel 'archive'
        existing.urls = [u for u in existing.urls if '://web.archive.org/web/None/' not in u.url]
        for i in range(len(existing.urls)):
            u = existing.urls[i]
            if u.rel == 'repository' and '://archive.org/download/' in u.url:
                existing.urls[i].rel = 'archive'
            if u.rel == 'social':
                u.rel = 'academicsocial'

        # new wayback URLs, could replace bad old short wayback URLs (from arabesque bug)
        new_wb_urls = [u.url for u in fe.urls]
        new_short_wb_urls = ['https://web.archive.org/web/{}/{}'.format(
            u.split('/')[4][:12], '/'.join(u.split('/')[5:])) for u in new_wb_urls]
        existing.urls = [u for u in existing.urls if u.url not in new_short_wb_urls]

        # merge the existing into this one and update
        merged_urls = {}
        for u in fe.urls + existing.urls:
            merged_urls[u.url] = u
        existing.urls = list(merged_urls.values())
        if not existing.extra.get('shadows'):
            existing.extra['shadows'] = fe.extra['shadows']
        else:
            existing.extra['shadows'].update(fe.extra['shadows'])

        # do these "plus ones" because we really want to do these updates when possible
        if len(existing.urls) > SANE_MAX_URLS + 1:
            self.counts['skip-update-too-many-url'] += 1
            return None
        existing.release_ids = list(set(fe.release_ids + existing.release_ids))
        if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
            self.counts['skip-update-too-many-releases'] += 1
            return None
        existing.mimetype = existing.mimetype or fe.mimetype
        existing.size = existing.size or fe.size
        existing.md5 = existing.md5 or fe.md5
        existing.sha1 = existing.sha1 or fe.sha1
        existing.sha256 = existing.sha256 or fe.sha256

        edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
        # add sha1 to non-entity edit row, so we can do more aggressive
        # group-level de-dupe
        edit.sha1 = existing.sha1
        self._edits_inflight.append(edit)
        self.counts['update'] += 1
        return False

    def insert_batch(self, batch):
        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
            editgroup=fatcat_openapi_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))
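
# Illustrative example (not part of the importer): a hypothetical input record
# with the shape documented in the ShadowLibraryImporter docstring. The corpus
# slug, identifiers, and hashes below are made up, and how records are pushed
# into the importer (e.g. via a JSON-lines pusher) depends on the calling code.
#
# {
#   "shadow": {"shadow_corpus": "example_corpus", "shadow_id": "12345",
#              "doi": "10.1234/example", "pmid": null, "isbn13": null},
#   "file_meta": {"sha1hex": "...", "sha256hex": "...", "md5hex": "...",
#                 "size_bytes": 123456, "mimetype": "application/pdf"},
#   "cdx": {"url": "https://example.com/paper.pdf",
#           "datetime": "20200101120000"}
# }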