Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/importers/__init__.py |   1
-rw-r--r--  python/fatcat_tools/importers/shadow.py   | 195
2 files changed, 196 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d936605f..10557ef8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
 from .wayback_static import auto_wayback_static
 from .cdl_dash_dat import auto_cdl_dash_dat
 from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..4cd22775
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,195 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+    """
+    Importer for shadow library files (matched to releases)
+
+    Input format is JSON with keys:
+    - shadow
+        - shadow_corpus (string slug)
+        - shadow_id (string)
+        - doi
+        - pmid
+        - isbn13
+    - file_meta
+        - sha1hex
+        - sha256hex
+        - md5hex
+        - size_bytes
+        - mimetype
+    - cdx (may be null)
+        - url
+        - datetime
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+        eg_extra = kwargs.pop('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+    def want(self, raw_record):
+        """
+        Only want to import records with complete file-level metadata
+        """
+        fm = raw_record['file_meta']
+        if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+            self.counts['skip-file-meta-incomplete'] += 1
+            return False
+        if fm['mimetype'] != 'application/pdf':
+            self.counts['skip-not-pdf'] += 1
+            return False
+        return True
+
+    def parse_record(self, obj):
+        """
+        We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
+        """
+
+        shadow_corpus = obj['shadow']['shadow_corpus']
+        assert shadow_corpus == shadow_corpus.strip().lower()
+        doi = clean_doi(obj['shadow'].get('doi'))
+        pmid = clean_pmid(obj['shadow'].get('pmid'))
+        isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
+        shadow_id = obj['shadow'].get('shadow_id').strip()
+        assert shadow_id
+
+        extra = { '{}_id'.format(shadow_corpus): shadow_id }
+        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+            if not ext_id:
+                continue
+            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+        # lookup release via several idents
+        re = None
+        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+            if not ext_id:
+                continue
+            try:
+                re = self.api.lookup_release(**{ext_type: ext_id})
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status not in (404, 400):
+                    raise err
+                re = None
+            if re:
+                break
+
+        if not re:
+            self.counts['skip-release-not-found'] += 1
+            return None
+
+        release_ids = [re.ident,]
+
+        # parse single CDX into URLs (if exists)
+        urls = []
+        if obj.get('cdx'):
+            url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+            if url != None:
+                urls.append(url)
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                obj['cdx']['datetime'],
+                obj['cdx']['url'])
+            urls.append(("webarchive", wayback))
+        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+        fe = fatcat_openapi_client.FileEntity(
+            md5=obj['file_meta']['md5hex'],
+            sha1=obj['file_meta']['sha1hex'],
+            sha256=obj['file_meta']['sha256hex'],
+            size=int(obj['file_meta']['size_bytes']),
+            mimetype=obj['file_meta']['mimetype'] or None,
+            release_ids=release_ids,
+            urls=urls,
+            extra=dict(shadows=extra),
+        )
+        return fe
+
+    def try_update(self, fe):
+        # lookup sha1, or create new entity
+        existing = None
+        try:
+            existing = self.api.lookup_file(sha1=fe.sha1)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if not existing:
+            return True
+
+        if not existing.extra:
+            existing.extra = {}
+
+        if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+            # already imported from this shadow library; skip
+            self.counts['exists'] += 1
+            return False
+
+        # check for edit conflicts
+        if existing.ident in [e.ident for e in self._edits_inflight]:
+            self.counts['skip-update-inflight'] += 1
+            return False
+        if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+            raise Exception("Inflight insert; shouldn't happen")
+
+        # minimum viable "existing" URL cleanup to fix dupes and broken links:
+        # remove 'None' wayback URLs, and set archive.org rel 'archive'
+        existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+        for i in range(len(existing.urls)):
+            u = existing.urls[i]
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                existing.urls[i].rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # merge the existing into this one and update
+        merged_urls = {}
+        for u in fe.urls + existing.urls:
+            merged_urls[u.url] = u
+        existing.urls = list(merged_urls.values())
+        if not existing.extra.get('shadows'):
+            existing.extra['shadows'] = fe.extra['shadows']
+        else:
+            existing.extra['shadows'].update(fe.extra['shadows'])
+
+        # do these "plus ones" because we really want to do these updates when possible
+        if len(existing.urls) > SANE_MAX_URLS + 1:
+            self.counts['skip-update-too-many-url'] += 1
+            return None
+        existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+        if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+            self.counts['skip-update-too-many-releases'] += 1
+            return None
+        existing.mimetype = existing.mimetype or fe.mimetype
+        existing.size = existing.size or fe.size
+        existing.md5 = existing.md5 or fe.md5
+        existing.sha1 = existing.sha1 or fe.sha1
+        existing.sha256 = existing.sha256 or fe.sha256
+        edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        # add sha1 to non-entity edit row, so we can do more aggressive
+        # group-level de-dupe
+        edit.sha1 = existing.sha1
+        self._edits_inflight.append(edit)
+        self.counts['update'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
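For reference, one parsed input record for this importer might look as follows. This is a sketch assuming only the JSON-lines schema documented in the ShadowLibraryImporter docstring above; every value, including the "scimag" corpus slug, the identifiers, and the hashes, is invented for illustration:

    # Hypothetical record, as parsed from one JSON line of importer input;
    # all identifiers and hashes below are placeholders, not real data.
    record = {
        "shadow": {
            "shadow_corpus": "scimag",       # example corpus slug (lowercase, stripped)
            "shadow_id": "10000000",         # example corpus-internal identifier
            "doi": "10.1234/example-doi",    # at least one of doi/pmid/isbn13 must resolve
            "pmid": None,
            "isbn13": None,
        },
        "file_meta": {
            "sha1hex": "0000000000000000000000000000000000000000",
            "sha256hex": "0000000000000000000000000000000000000000000000000000000000000000",
            "md5hex": "00000000000000000000000000000000",
            "size_bytes": 1048576,
            "mimetype": "application/pdf",   # want() skips anything that is not a PDF
        },
        "cdx": {                             # may be null; used to build the wayback URL
            "url": "https://example.com/fulltext.pdf",
            "datetime": "20200101010101",    # 14-digit wayback timestamp
        },
    }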

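A minimal driver sketch for running the importer. It assumes the JsonLinePusher helper from .common (exported from fatcat_tools.importers alongside the importer classes) and the authenticated_api helper from fatcat_tools, following the pattern other fatcat importers are invoked with; the host URL and input path are placeholders:

    from fatcat_tools import authenticated_api
    from fatcat_tools.importers import JsonLinePusher, ShadowLibraryImporter

    # Imports create editgroups, so an authenticated client is required
    # (authenticated_api reads the auth token from the environment if not given).
    api = authenticated_api("http://localhost:9411/v0")
    importer = ShadowLibraryImporter(api, edit_batch_size=100)

    # Feed the importer one parsed JSON record per input line.
    with open("shadow_matches.json") as f:
        JsonLinePusher(importer, f).run()

Note the try_update() return convention inherited from EntityImporter: returning True queues the new FileEntity for creation via insert_batch(), while returning False or None means the record was already handled (an in-place update, or a known duplicate) or skipped.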