Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--   python/fatcat_tools/importers/common.py            137
-rw-r--r--   python/fatcat_tools/importers/crossref.py           272
-rw-r--r--   python/fatcat_tools/importers/grobid_metadata.py    168
-rw-r--r--   python/fatcat_tools/importers/issn.py                72
-rw-r--r--   python/fatcat_tools/importers/matched.py            144
-rw-r--r--   python/fatcat_tools/importers/orcid.py               73
6 files changed, 866 insertions, 0 deletions
| diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py new file mode 100644 index 00000000..8dfee875 --- /dev/null +++ b/python/fatcat_tools/importers/common.py @@ -0,0 +1,137 @@ + +import re +import sys +import csv +import json +import itertools +import fatcat_client +from fatcat_client.rest import ApiException + +# from: https://docs.python.org/3/library/itertools.html +def grouper(iterable, n, fillvalue=None): +    "Collect data into fixed-length chunks or blocks" +    args = [iter(iterable)] * n +    return itertools.zip_longest(*args, fillvalue=fillvalue) + +class FatcatImporter: + +    def __init__(self, host_url, issn_map_file=None): +        conf = fatcat_client.Configuration() +        conf.host = host_url +        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) +        self._issnl_id_map = dict() +        self._orcid_id_map = dict() +        self._doi_id_map = dict() +        self._issn_issnl_map = None +        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") +        if issn_map_file: +            self.read_issn_map_file(issn_map_file) +        self.processed_lines = 0 +        self.insert_count = 0 +        self.update_count = 0 + +    def describe_run(self): +        print("Processed {} lines, inserted {}, updated {}.".format( +            self.processed_lines, self.insert_count, self.update_count)) + +    def process_source(self, source, group_size=100): +        """Creates and auto-accepts editgroup every group_size rows""" +        eg = self.api.create_editgroup( +            fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) +        for i, row in enumerate(source): +            self.create_row(row, editgroup=eg.id) +            if i > 0 and (i % group_size) == 0: +                self.api.accept_editgroup(eg.id) +                eg = self.api.create_editgroup( +                    fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) +            self.processed_lines = self.processed_lines + 1 +        if i == 0 or (i % group_size) != 0: +            self.api.accept_editgroup(eg.id) + +    def process_batch(self, source, size=50): +        """Reads and processes in batches (not API-call-per-)""" +        for rows in grouper(source, size): +            self.processed_lines = self.processed_lines + len(rows) +            eg = self.api.create_editgroup( +                fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) +            self.create_batch(rows, editgroup=eg.id) + +    def process_csv_source(self, source, group_size=100, delimiter=','): +        reader = csv.DictReader(source, delimiter=delimiter) +        self.process_source(reader, group_size) + +    def process_csv_batch(self, source, size=50, delimiter=','): +        reader = csv.DictReader(source, delimiter=delimiter) +        self.process_batch(reader, size) + +    def is_issnl(self, issnl): +        return len(issnl) == 9 and issnl[4] == '-' + +    def lookup_issnl(self, issnl): +        """Caches calls to the ISSN-L lookup API endpoint in a local dict""" +        if issnl in self._issnl_id_map: +            return self._issnl_id_map[issnl] +        container_id = None +        try: +            rv = self.api.lookup_container(issnl=issnl) +            container_id = rv.ident +        except ApiException as ae: +            # If anything other than a 404 (not found), something is wrong +            assert ae.status == 404 +        self._issnl_id_map[issnl] = container_id # might be None 
+        return container_id + +    def is_orcid(self, orcid): +        return self._orcid_regex.match(orcid) != None + +    def lookup_orcid(self, orcid): +        """Caches calls to the Orcid lookup API endpoint in a local dict""" +        if not self.is_orcid(orcid): +            return None +        if orcid in self._orcid_id_map: +            return self._orcid_id_map[orcid] +        creator_id = None +        try: +            rv = self.api.lookup_creator(orcid=orcid) +            creator_id = rv.ident +        except ApiException as ae: +            # If anything other than a 404 (not found), something is wrong +            assert ae.status == 404 +        self._orcid_id_map[orcid] = creator_id # might be None +        return creator_id + +    def is_doi(self, doi): +        return doi.startswith("10.") and doi.count("/") >= 1 + +    def lookup_doi(self, doi): +        """Caches calls to the doi lookup API endpoint in a local dict""" +        assert self.is_doi(doi) +        doi = doi.lower() +        if doi in self._doi_id_map: +            return self._doi_id_map[doi] +        release_id = None +        try: +            rv = self.api.lookup_release(doi=doi) +            release_id = rv.ident +        except ApiException as ae: +            # If anything other than a 404 (not found), something is wrong +            assert ae.status == 404 +        self._doi_id_map[doi] = release_id # might be None +        return release_id + +    def read_issn_map_file(self, issn_map_file): +        print("Loading ISSN map file...") +        self._issn_issnl_map = dict() +        for line in issn_map_file: +            if line.startswith("ISSN") or len(line) == 0: +                continue +            (issn, issnl) = line.split()[0:2] +            self._issn_issnl_map[issn] = issnl +            # double mapping makes lookups easy +            self._issn_issnl_map[issnl] = issnl +        print("Got {} ISSN-L mappings.".format(len(self._issn_issnl_map))) + +    def issn2issnl(self, issn): +        if issn is None: +            return None +        return self._issn_issnl_map.get(issn) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py new file mode 100644 index 00000000..dddb58d1 --- /dev/null +++ b/python/fatcat_tools/importers/crossref.py @@ -0,0 +1,272 @@ + +import sys +import json +import sqlite3 +import datetime +import itertools +import fatcat_client +from fatcat_tools.importers.common import FatcatImporter + + +class FatcatCrossrefImporter(FatcatImporter): + +    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True): +        super().__init__(host_url, issn_map_file) +        self.extid_map_db = None +        if extid_map_file: +            db_uri = "file:{}?mode=ro".format(extid_map_file) +            print("Using external ID map: {}".format(db_uri)) +            self.extid_map_db = sqlite3.connect(db_uri, uri=True) +        else: +            print("Not using external ID map") +        self.create_containers = create_containers + +    def lookup_ext_ids(self, doi): +        if self.extid_map_db is None: +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) +        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", +            [doi.lower()]).fetchone() +        if row is None: +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) +        row = [str(cell or '') or None for cell in row] +        return dict( +            core_id=row[0], +            pmid=row[1], +            pmcid=row[2], +            wikidata_qid=row[3]) + +    def parse_crossref_dict(self, obj): +        """ +        obj is a python dict (parsed from json). +        returns a ReleaseEntity +        """ + +        # This work is out of scope if it doesn't have authors and a title +        if (not 'author' in obj) or (not 'title' in obj): +            return None + +        # Other ways to be out of scope (provisionally) +        if (not 'type' in obj): +            return None + +        # contribs +        def do_contribs(obj_list, ctype): +            contribs = [] +            for i, am in enumerate(obj_list): +                creator_id = None +                if 'ORCID' in am.keys(): +                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) +                # Sorry humans :( +                if am.get('given') and am.get('family'): +                    raw_name = "{} {}".format(am['given'], am['family']) +                elif am.get('family'): +                    raw_name = am['family'] +                else: +                    # TODO: defaults back to a pseudo-null value +                    raw_name = am.get('given', '<blank>') +                extra = dict() +                if ctype == "author": +                    index = i +                else: +                    index = None +                if am.get('affiliation'): +                    # note: affiliation => affiliations +                    extra['affiliations'] = am.get('affiliation') +                if am.get('sequence') and am.get('sequence') != "additional": +                    extra['sequence'] = am.get('sequence') +                if not extra: +                    extra = None +                contribs.append(fatcat_client.ReleaseContrib( +                    creator_id=creator_id, +                    index=index, +                    raw_name=raw_name, +                    role=ctype, +                    extra=extra)) +            return contribs +        contribs = do_contribs(obj['author'], "author") +        contribs.extend(do_contribs(obj.get('editor', []), "editor")) +        contribs.extend(do_contribs(obj.get('translator', []), "translator")) + +        # container +        issn = obj.get('ISSN', [None])[0] +        issnl = self.issn2issnl(issn) +        container_id = None +        if issnl: +            container_id = self.lookup_issnl(issnl) +        publisher = obj.get('publisher') + +        ce = None +        if (container_id is None and self.create_containers and issnl != None  +            and obj.get('container-title') and len(obj['container-title']) > 0): +            ce = fatcat_client.ContainerEntity( +                issnl=issnl, +                publisher=publisher, +                name=obj['container-title'][0]) + +        # references +        refs = [] +        for i, rm in enumerate(obj.get('reference', [])): +            try: +                year = int(rm.get('year')) +                # NOTE: will need to update/config in the future! +                # NOTE: are there crossref works with year < 100? 
+                if year > 2025 or year < 100: +                    year = None +            except: +                year = None +            extra = rm.copy() +            if rm.get('DOI'): +                extra['doi'] = rm.get('DOI').lower() +            key = rm.get('key') +            if key and key.startswith(obj['DOI'].upper()): +                key = key.replace(obj['DOI'].upper() + "-", '') +                key = key.replace(obj['DOI'].upper(), '') +            container_name = rm.get('volume-title') +            if not container_name: +                container_name = rm.get('journal-title') +            extra.pop('DOI', None) +            extra.pop('key', None) +            extra.pop('year', None) +            extra.pop('volume-name', None) +            extra.pop('journal-title', None) +            extra.pop('title', None) +            extra.pop('first-page', None) +            extra.pop('doi-asserted-by', None) +            if extra: +                extra = dict(crossref=extra) +            else: +                extra = None +            refs.append(fatcat_client.ReleaseRef( +                index=i, +                # doing lookups would be a second import pass +                target_release_id=None, +                key=key, +                year=year, +                container_name=container_name, +                title=rm.get('title'), +                locator=rm.get('first-page'), +                # TODO: just dump JSON somewhere here? +                extra=extra)) + +        # abstracts +        abstracts = [] +        if obj.get('abstract') != None: +            abstracts.append(fatcat_client.ReleaseEntityAbstracts( +                mimetype="application/xml+jats", +                content=obj.get('abstract'))) + +        # extra fields +        extra = dict() +        for key in ('subject', 'type', 'license', 'alternative-id', +                'container-title', 'original-title', 'subtitle', 'archive', +                'funder', 'group-title'): +            # TODO: unpack "container-title" array +            val = obj.get(key) +            if val: +                extra[key] = val +        if 'license' in extra and extra['license']: +            for i in range(len(extra['license'])): +                if 'start' in extra['license'][i]: +                    extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] +        if len(obj['title']) > 1: +            extra['other-titles'] = obj['title'][1:] +        # TODO: this should be top-level +        extra['is_kept'] = len(obj.get('archive', [])) > 0 + +        # ISBN +        isbn13 = None +        for raw in obj.get('ISBN', []): +            # TODO: convert if not ISBN-13 format +            if len(raw) == 17: +                isbn13 = raw +                break + +        # release status +        if obj['type'] in ('journal-article', 'conference-proceeding', 'book', +                'dissertation', 'book-chapter'): +            release_status = "published" +        else: +            # unknown +            release_status = None + +        # external identifiers +        extids = self.lookup_ext_ids(doi=obj['DOI'].lower()) + +        # TODO: filter out huge releases; we'll get them later (and fix bug in +        # fatcatd) +        if max(len(contribs), len(refs), len(abstracts)) > 750: +            return None + +        # release date parsing is amazingly complex +        release_date = obj['issued']['date-parts'][0] +        if not release_date or not release_date[0]: +            # got some 
NoneType, even though at least year is supposed to be set +            release_date = None +        elif len(release_date) == 3: +            release_date = datetime.datetime(year=release_date[0], month=release_date[1], day=release_date[2]) +        else: +            # only the year is actually required; mangle to first day for date +            # (TODO: something better?) +            release_date = datetime.datetime(year=release_date[0], month=1, day=1) +        # convert to string ISO datetime format (if not null) +        if release_date: +            release_date = release_date.isoformat() + "Z" + +        re = fatcat_client.ReleaseEntity( +            work_id=None, +            title=obj['title'][0], +            contribs=contribs, +            refs=refs, +            container_id=container_id, +            publisher=publisher, +            release_type=obj['type'], +            release_status=release_status, +            doi=obj['DOI'].lower(), +            isbn13=isbn13, +            core_id=extids['core_id'], +            pmid=extids['pmid'], +            pmcid=extids['pmcid'], +            wikidata_qid=extids['wikidata_qid'], +            release_date=release_date, +            issue=obj.get('issue'), +            volume=obj.get('volume'), +            pages=obj.get('page'), +            abstracts=abstracts, +            extra=dict(crossref=extra)) +        return (re, ce) + +    def create_row(self, row, editgroup=None): +        if row is None: +            return +        obj = json.loads(row) +        entities = self.parse_crossref_dict(obj) +        if entities is not None: +            (re, ce) = entities +            if ce is not None: +                container = self.api.create_container(ce, editgroup=editgroup) +                re.container_id = container.ident +                self._issnl_id_map[ce.issnl] = container.ident +            self.api.create_release(re, editgroup=editgroup) +            self.insert_count = self.insert_count + 1 + +    def create_batch(self, batch, editgroup=None): +        """Current work/release pairing disallows batch creation of releases. 
+        Could do batch work creation and then match against releases, but meh.""" +        release_batch = [] +        for row in batch: +            if row is None: +                continue +            obj = json.loads(row) +            entities = self.parse_crossref_dict(obj) +            if entities is not None: +                (re, ce) = entities +                if ce is not None: +                    ce_eg = self.api.create_editgroup( +                        fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) +                    container = self.api.create_container(ce, editgroup=ce_eg.id) +                    self.api.accept_editgroup(ce_eg.id) +                    re.container_id = container.ident +                    self._issnl_id_map[ce.issnl] = container.ident +                release_batch.append(re) +        self.api.create_release_batch(release_batch, autoaccept="true", editgroup=editgroup) +        self.insert_count = self.insert_count + len(release_batch) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py new file mode 100644 index 00000000..56b2ee02 --- /dev/null +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 + +import sys +import json +import base64 +import datetime +import fatcat_client +from fatcat_tools.importers.common import FatcatImporter + +MAX_ABSTRACT_BYTES=4096 + + +class FatcatGrobidMetadataImporter(FatcatImporter): + +    def __init__(self, host_url, default_link_rel="web"): +        super().__init__(host_url) +        self.default_link_rel = default_link_rel + +    def parse_grobid_json(self, obj): + +        if not obj.get('title'): +            return None + +        release = dict() +        extra = dict() + +        if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: +            abobj = dict( +                mimetype="text/plain", +                language=None, +                content=obj.get('abstract').strip()) +            abstracts = [abobj] +        else: +            abstracts = None + +        contribs = [] +        for i, a in enumerate(obj.get('authors', [])): +            c = dict(raw_name=a['name'], role="author") +            contribs.append(fatcat_client.ReleaseContrib( +                index=i, +                raw_name=a['name'], +                role="author", +                extra=None)) + +        refs = [] +        for raw in obj.get('citations', []): +            cite_extra = dict() +            ref = dict() +            ref['key'] = raw.get('id') +            if raw.get('title'): +                ref['title'] = raw['title'].strip() +            if raw.get('date'): +                try: +                    year = int(raw['date'].strip()[:4]) +                    ref['year'] = year +                except: +                    pass +            for key in ('volume', 'url', 'issue', 'publisher'): +                if raw.get(key): +                    cite_extra[key] = raw[key].strip() +            if raw.get('authors'): +                cite_extra['authors'] = [a['name'] for a in raw['authors']] +            if cite_extra: +                cite_extra = dict(grobid=cite_extra) +            else: +                cite_extra = None +            ref['extra'] = cite_extra +            refs.append(ref) + +        release_type = "journal-article" +        release_date = None +        if obj.get('date'): +            # TODO: only returns year, ever? how to handle? 
+            release_date = datetime.datetime(year=int(obj['date'][:4]), month=1, day=1) + +        if obj.get('doi'): +            extra['doi'] = obj['doi'] +        if obj['journal'] and obj['journal'].get('name'): +            extra['container_name'] = obj['journal']['name'] +         +        extra['is_longtail_oa'] = True + +        # TODO: ISSN/eISSN handling? or just journal name lookup? + +        if extra: +            extra = dict(grobid=extra) +        else: +            extra = None + +        re = fatcat_client.ReleaseEntity( +            title=obj['title'].strip(), +            contribs=contribs, +            refs=refs, +            publisher=obj['journal'].get('publisher'), +            volume=obj['journal'].get('volume'), +            issue=obj['journal'].get('issue'), +            abstracts=abstracts, +            extra=extra) +        return re +     +    # TODO: make this a common function somewhere +    def make_url(self, raw): +        rel = self.default_link_rel +        # TODO: this is where we could map specific domains to rel types, +        # and also filter out bad domains, invalid URLs, etc +        if "//archive.org/" in raw or "//arxiv.org/" in raw: +            # TODO: special-case the arxiv.org bulk mirror? +            rel = "repository" +        elif "//web.archive.org/" in raw or "//archive.is/" in raw: +            rel = "webarchive" +        return fatcat_client.FileEntityUrls(url=raw, rel=rel) + +    def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): +         +        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() + +        # lookup existing SHA1, or create new entity +        try: +            existing_file = self.api.lookup_file(sha1=sha1) +        except fatcat_client.rest.ApiException as err: +            if err.status != 404: +                raise err +            existing_file = None + +        if existing_file: +            # if file is already in here, presumably not actually long-tail +            return None +        fe = fatcat_client.FileEntity( +            sha1=sha1, +            size=int(file_size), +            mimetype=mimetype, +            releases=[], +            urls=[], +        ) + +        # parse URLs and CDX +        original = cdx['url'] +        wayback = "https://web.archive.org/web/{}/{}".format( +            cdx['dt'], +            original) +        fe.urls.append( +            fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) +        original_url = self.make_url(original) +        if original_url != None: +            fe.urls.append(original_url) + +        return fe + +    def create_row(self, row, editgroup=None): +        if not row: +            return +        fields = row.split('\t') +        sha1_key = fields[0] +        cdx = json.loads(fields[1]) +        mimetype = fields[2] +        file_size = int(fields[3]) +        grobid_meta = json.loads(fields[4]) +        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) +        re = self.parse_grobid_json(grobid_meta) +        if fe and re: +            release_entity = self.api.create_release(re, editgroup=editgroup) +            # release ident can't already be in release list because we just +            # created it +            fe.releases.append(release_entity.ident) +            file_entity = self.api.create_file(fe, editgroup=editgroup) +            self.insert_count = self.insert_count + 1 + +    # NB: batch mode not implemented diff --git 
a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py new file mode 100644 index 00000000..d7fb9082 --- /dev/null +++ b/python/fatcat_tools/importers/issn.py @@ -0,0 +1,72 @@ + +import sys +import json +import itertools +import fatcat_client +from fatcat_tools.importers.common import FatcatImporter + +# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): +# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + +def or_none(s): +    if s is None: +        return None +    if len(s) == 0: +        return None +    return s + +def truthy(s): +    if s is None: +        return None +    s = s.lower() +    if s in ('true', 't', 'yes', 'y', '1'): +        return True +    elif s in ('false', 'f', 'no', 'n', '0'): +        return False +    else: +        return None + +class FatcatIssnImporter(FatcatImporter): + +    def parse_issn_row(self, row): +        """ +        row is a python dict (parsed from CSV). +        returns a ContainerEntity +        """ +        title = or_none(row['title']) +        issnl = or_none(row['ISSN-L']) +        if title is None or issnl is None: +            return +        extra = dict( +            in_doaj=truthy(row['in_doaj']), +            in_road=truthy(row['in_road']), +            in_norwegian=truthy(row['in_norwegian']), +            language=or_none(row['lang']), +            url=or_none(row['url']), +            ISSNp=or_none(row['ISSN-print']), +            ISSNe=or_none(row['ISSN-electronic']), +            is_oa=truthy(row['is_oa']), +            is_kept=truthy(row['is_kept']), +        ) +        ce = fatcat_client.ContainerEntity( +            issnl=issnl, +            name=title, +            publisher=or_none(row['publisher']), +            abbrev=None, +            coden=None, +            extra=extra) +        return ce + +    def create_row(self, row, editgroup=None): +        ce = self.parse_issn_row(row) +        if ce is not None: +            self.api.create_container(ce, editgroup=editgroup) +            self.insert_count = self.insert_count + 1 + +    def create_batch(self, batch, editgroup=None): +        """Reads and processes in batches (not API-call-per-line)""" +        objects = [self.parse_issn_row(l) +                   for l in batch if l != None] +        objects = [o for o in objects if o != None] +        self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup) +        self.insert_count = self.insert_count + len(objects) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py new file mode 100644 index 00000000..6270fe88 --- /dev/null +++ b/python/fatcat_tools/importers/matched.py @@ -0,0 +1,144 @@ + +import sys +import json +import sqlite3 +import itertools +import fatcat_client +from fatcat_tools.importers.common import FatcatImporter + +#row = row.split('\t') +#assert len(row) == 2 +#sha1 = row[0].replace('sha1:') +#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() +#print(sha1) +#dois = [d.lower() for d in json.loads(row[1])] + +class FatcatMatchedImporter(FatcatImporter): +    """ +    Input format is JSON with keys: +    - dois (list) +    - sha1 (hex) +    - md5 (hex) +    - sha256 (hex) +    - size (int) +    - cdx (list of objects) +        - dt +        - url +    - 
mimetype +    - urls (list of strings... or objects?) + +    Future handlings/extensions: +    - core_id, wikidata_id, pmcid, pmid: not as lists +    """ + +    def __init__(self, host_url, skip_file_update=False, default_mime=None, +            default_link_rel="web"): +        super().__init__(host_url) +        self.default_mime = default_mime +        self.default_link_rel = default_link_rel +        self.skip_file_update = skip_file_update + +    def make_url(self, raw): +        rel = self.default_link_rel +        # TODO: this is where we could map specific domains to rel types, +        # and also filter out bad domains, invalid URLs, etc +        if "//archive.org/" in raw or "//arxiv.org/" in raw: +            # TODO: special-case the arxiv.org bulk mirror? +            rel = "repository" +        elif "//web.archive.org/" in raw or "//archive.is/" in raw: +            rel = "webarchive" +        return fatcat_client.FileEntityUrls(url=raw, rel=rel) + +    def parse_matched_dict(self, obj): +        sha1 = obj['sha1'] +        dois = [d.lower() for d in obj.get('dois', [])] + +        # lookup sha1, or create new entity +        fe = None +        if not self.skip_file_update: +            try: +                fe = self.api.lookup_file(sha1=sha1) +            except fatcat_client.rest.ApiException as err: +                if err.status != 404: +                    raise err +        if fe is None: +            fe = fatcat_client.FileEntity( +                sha1=sha1, +                releases=[], +                urls=[], +            ) + +        # lookup dois +        re_list = set() +        for doi in dois: +            try: +                re = self.api.lookup_release(doi=doi) +            except fatcat_client.rest.ApiException as err: +                if err.status != 404: +                    raise err +                re = None +            if re is None: +                print("DOI not found: {}".format(doi)) +            else: +                re_list.add(re.ident) +        if len(re_list) == 0: +            return None +        if fe.releases == set(re_list): +            return None +        re_list.update(fe.releases) +        fe.releases = list(re_list) + +        # parse URLs and CDX +        existing_urls = [feu.url for feu in fe.urls] +        for url in obj.get('url', []): +            if url not in existing_urls: +                url = self.make_url(url) +                if url != None: +                    fe.urls.append(url) +        for cdx in obj.get('cdx', []): +            original = cdx['url'] +            wayback = "https://web.archive.org/web/{}/{}".format( +                cdx['dt'], +                original) +            if wayback not in existing_urls: +                fe.urls.append( +                    fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) +            if original not in existing_urls: +                url = self.make_url(original) +                if url != None: +                    fe.urls.append(url) + +        if obj.get('size') != None: +            fe.size = int(obj['size']) +        fe.sha256 = obj.get('sha256', fe.sha256) +        fe.md5 = obj.get('md5', fe.sha256) +        if obj.get('mimetype') is None: +            if fe.mimetype is None: +                fe.mimetype = self.default_mime +        else: +            fe.mimetype = obj.get('mimetype') +        return fe + +    def create_row(self, row, editgroup=None): +        obj = json.loads(row) +        fe = self.parse_matched_dict(obj) +        if fe is 
not None: +            if fe.ident is None: +                self.api.create_file(fe, editgroup=editgroup) +                self.insert_count = self.insert_count + 1 +            else: +                self.api.update_file(fe.ident, fe, editgroup=editgroup) +                self.update_count = self.update_count + 1 + +    def create_batch(self, batch, editgroup=None): +        """Reads and processes in batches (not API-call-per-line)""" +        objects = [self.parse_matched_dict(json.loads(l)) +                   for l in batch if l != None] +        new_objects = [o for o in objects if o != None and o.ident == None] +        update_objects = [o for o in objects if o != None and o.ident != None] +        for obj in update_objects: +            self.api.update_file(obj.ident, obj, editgroup=editgroup) +        if len(new_objects) > 0: +            self.api.create_file_batch(new_objects, autoaccept="true", editgroup=editgroup) +        self.update_count = self.update_count + len(update_objects) +        self.insert_count = self.insert_count + len(new_objects) diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py new file mode 100644 index 00000000..350c4c57 --- /dev/null +++ b/python/fatcat_tools/importers/orcid.py @@ -0,0 +1,73 @@ + +import sys +import json +import itertools +import fatcat_client +from fatcat_tools.importers.common import FatcatImporter + +def value_or_none(e): +    if type(e) == dict: +        e = e.get('value') +    if type(e) == str and len(e) == 0: +        e = None +    # TODO: this is probably bogus; patched in desperation; remove? +    if e: +        try: +            e.encode() +        except UnicodeEncodeError: +            # Invalid JSON? +            print("BAD UNICODE") +            return None +    return e + +class FatcatOrcidImporter(FatcatImporter): + +    def parse_orcid_dict(self, obj): +        """ +        obj is a python dict (parsed from json). 
+        returns a CreatorEntity +        """ +        name = obj['person']['name'] +        if name is None: +            return None +        extra = None +        given = value_or_none(name.get('given-names')) +        sur = value_or_none(name.get('family-name')) +        display = value_or_none(name.get('credit-name')) +        if display is None: +            # TODO: sorry human beings +            if given and sur: +                display = "{} {}".format(given, sur) +            elif sur: +                display = sur +            elif given: +                display = given +            else: +                # must have *some* name +                return None +        orcid = obj['orcid-identifier']['path'] +        if not self.is_orcid(orcid): +            sys.stderr.write("Bad ORCID: {}\n".format(orcid)) +            return None +        ce = fatcat_client.CreatorEntity( +            orcid=orcid, +            given_name=given, +            surname=sur, +            display_name=display, +            extra=extra) +        return ce + +    def create_row(self, row, editgroup=None): +        obj = json.loads(row) +        ce = self.parse_orcid_dict(obj) +        if ce is not None: +            self.api.create_creator(ce, editgroup=editgroup) +            self.insert_count = self.insert_count + 1 + +    def create_batch(self, batch, editgroup=None): +        """Reads and processes in batches (not API-call-per-line)""" +        objects = [self.parse_orcid_dict(json.loads(l)) +                   for l in batch if l != None] +        objects = [o for o in objects if o != None] +        self.api.create_creator_batch(objects, autoaccept="true", editgroup=editgroup) +        self.insert_count = self.insert_count + len(objects) | 
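A few self-contained sketches of the logic introduced above, for reference. The batch-mode paths in common.py lean on the grouper() recipe from the itertools documentation, which pads the final chunk with None; the create_batch() implementations then have to skip those fill values. A minimal demonstration (standard library only, no fatcat_client required):

    import itertools

    def grouper(iterable, n, fillvalue=None):
        "Collect data into fixed-length chunks or blocks"
        args = [iter(iterable)] * n
        return itertools.zip_longest(*args, fillvalue=fillvalue)

    # Ten rows split into batches of 4; the last batch is padded with None,
    # so batch consumers must filter the padding out (as create_batch does).
    rows = ["row{}".format(i) for i in range(10)]
    for batch in grouper(rows, 4):
        real_rows = [r for r in batch if r is not None]
        print(len(batch), len(real_rows))
    # prints: 4 4 / 4 4 / 4 2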
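read_issn_map_file() in common.py expects a whitespace-delimited mapping file with a header row starting with "ISSN", and stores a double mapping so that both a print/electronic ISSN and the ISSN-L itself resolve to the ISSN-L. A sketch with made-up ISSN values:

    import io

    # Hypothetical file contents; the real input is an ISSN-to-ISSN-L table dump.
    issn_map_file = io.StringIO(
        "ISSN\tISSN-L\n"
        "1234-5678\t1234-5678\n"
        "8765-4321\t1234-5678\n")

    issn_issnl_map = dict()
    for line in issn_map_file:
        if line.startswith("ISSN") or len(line.strip()) == 0:
            continue
        (issn, issnl) = line.split()[0:2]
        issn_issnl_map[issn] = issnl
        # double mapping makes lookups easy: an ISSN-L maps to itself
        issn_issnl_map[issnl] = issnl

    print(issn_issnl_map.get("8765-4321"))   # 1234-5678
    print(issn_issnl_map.get("1234-5678"))   # 1234-5678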
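lookup_ext_ids() in crossref.py reads CORE/PubMed/PMC/Wikidata identifiers out of a read-only SQLite file keyed by lower-cased DOI. The column names below are inferred from the SELECT statement and the DOI is a made-up example; an in-memory stand-in table shows the row cleanup that collapses empty strings to None:

    import sqlite3

    # In-memory stand-in; the importer opens the real map with a
    # "file:<path>?mode=ro" URI so the database stays read-only.
    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE ids (doi TEXT, core TEXT, pmid TEXT, pmcid TEXT, wikidata TEXT)")
    db.execute("INSERT INTO ids VALUES (?, ?, ?, ?, ?)",
               ("10.1234/example", "12345", "", "PMC7890", "Q55555"))

    row = db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
                     ["10.1234/example"]).fetchone()
    # empty strings collapse to None, matching the importer's cleanup step
    row = [str(cell or '') or None for cell in row]
    print(dict(core_id=row[0], pmid=row[1], pmcid=row[2], wikidata_qid=row[3]))
    # {'core_id': '12345', 'pmid': None, 'pmcid': 'PMC7890', 'wikidata_qid': 'Q55555'}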
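The Crossref importer turns the "issued" date-parts array into an ISO datetime string, falling back to January 1st when only a year is present and dropping the date entirely when even the year is missing. The same logic as a standalone function:

    import datetime

    def parse_date_parts(date_parts):
        # date_parts looks like [[2014, 7, 18]] or [[2014]] in a Crossref record
        release_date = date_parts[0] if date_parts else None
        if not release_date or not release_date[0]:
            # some records carry a None year even though one is expected
            return None
        if len(release_date) == 3:
            dt = datetime.datetime(year=release_date[0], month=release_date[1],
                                   day=release_date[2])
        else:
            # only the year is actually required; mangle to the first of the year
            dt = datetime.datetime(year=release_date[0], month=1, day=1)
        return dt.isoformat() + "Z"

    print(parse_date_parts([[2014, 7, 18]]))   # 2014-07-18T00:00:00Z
    print(parse_date_parts([[2014]]))          # 2014-01-01T00:00:00Z
    print(parse_date_parts([[None]]))          # None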
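grobid_metadata.py receives file hashes as "sha1:"-prefixed, upper-case base32 strings (the CDX convention) and converts them to lower-case hex before the fatcat lookup. Round-tripping a digest computed locally shows the conversion is lossless:

    import base64
    import hashlib

    digest = hashlib.sha1(b"hello world").digest()
    sha1_key = "sha1:" + base64.b32encode(digest).decode('ascii')

    # same conversion as parse_file_metadata()
    sha1_hex = base64.b16encode(
        base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
    print(sha1_hex == hashlib.sha1(b"hello world").hexdigest())   # True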
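Both grobid_metadata.py and matched.py share the same make_url() heuristic: guess a rel type from the domain, and build wayback URLs from the CDX timestamp plus the original URL. A sketch using plain strings instead of fatcat_client.FileEntityUrls objects, with a hypothetical CDX row:

    def guess_rel(raw, default_link_rel="web"):
        # map specific domains to rel types; a fuller version would also
        # filter out bad domains and invalid URLs
        if "//archive.org/" in raw or "//arxiv.org/" in raw:
            return "repository"
        elif "//web.archive.org/" in raw or "//archive.is/" in raw:
            return "webarchive"
        return default_link_rel

    cdx = {"dt": "20180604073726", "url": "https://example.com/paper.pdf"}
    wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], cdx["url"])
    print(guess_rel(wayback))       # webarchive
    print(guess_rel(cdx["url"]))    # web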
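is_orcid() in common.py (used by both the Crossref contrib handling and the ORCID importer) relies on a strict pattern for the 16-digit identifier grouped 4-4-4-4, where the final position may be an "X" check digit. The identifiers below are sample values used only to exercise the pattern:

    import re

    orcid_regex = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")

    for orcid in ("0000-0002-1825-0097", "0000-0002-1694-233X", "0000-0002-1825-009"):
        print(orcid, bool(orcid_regex.match(orcid)))
    # 0000-0002-1825-0097 True
    # 0000-0002-1694-233X True
    # 0000-0002-1825-009 False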
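Finally, a sketch of how one of these importers might be driven end to end. The host URL and input filename are assumptions (no entry-point script is part of this changeset), and fatcat_client must be installed and pointed at a running fatcat API:

    # hypothetical driver script
    from fatcat_tools.importers.orcid import FatcatOrcidImporter

    importer = FatcatOrcidImporter("http://localhost:9411/v0")   # assumed API endpoint
    with open("orcid_profiles.json", "r") as f:                  # one JSON record per line
        importer.process_batch(f, size=50)
    importer.describe_run()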
