| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 09:37:37 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 09:37:37 -0700 |
| commit | bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804 (patch) | |
| tree | 67d7039b1621bebdafd89539602c2b5d05332501 /python | |
| parent | 698399c49edcefe33c012856b604985925969a77 (diff) | |
| download | fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.tar.gz, fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.zip | |
python: refactor importer code (+crossref)
Diffstat (limited to 'python')
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | python/client.py | 19 |
| -rw-r--r-- | python/fatcat/api_client.py | 177 |
| -rw-r--r-- | python/fatcat/crossref_importer.py | 112 |
| -rw-r--r-- | python/fatcat/importer_common.py | 53 |
| -rw-r--r-- | python/fatcat/orcid_importer.py | 49 |
| -rw-r--r-- | python/fatcat/raw_api_client.py | 66 |
| -rw-r--r-- | python/tests/api_client.py | 15 |
| -rw-r--r-- | python/tests/crossref.py | 16 |
| -rw-r--r-- | python/tests/fixtures.py | 17 |
| -rw-r--r-- | python/tests/orcid.py | 2 |
10 files changed, 274 insertions, 252 deletions
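For orientation: the refactor replaces the monolithic `FatCatApiClient` with a shared `FatcatImporter` base class (editgroup lifecycle, cached ISSN-L lookups) in `importer_common.py`, per-source subclasses (`FatcatCrossrefImporter`, `FatcatOrcidImporter`), and a thin `RawFatcatApiClient` for the remaining hand-rolled HTTP calls. A minimal sketch of driving one of the refactored importers, mirroring `client.py` and the tests in this commit (it assumes a fatcat API server listening on localhost:9411):

```python
from fatcat.orcid_importer import FatcatOrcidImporter

# One JSON record per line; process_batch() groups rows into chunks of
# `size`, creates one editgroup per chunk, and accepts it after insertion.
importer = FatcatOrcidImporter("http://localhost:9411/v0")
with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
    importer.process_batch(f, size=50)
```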
```diff
diff --git a/python/client.py b/python/client.py
index 14814512..9631318a 100755
--- a/python/client.py
+++ b/python/client.py
@@ -2,21 +2,21 @@
 
 import sys
 import argparse
-from fatcat.api_client import FatCatApiClient
+from fatcat.raw_api_client import RawFatcatApiClient
 from fatcat.orcid_importer import FatcatOrcidImporter
 
 def run_import_crossref(args):
-    fcc = FatCatApiClient(args.host_url)
-    fcc.import_crossref_file(args.json_file,
-        create_containers=args.create_containers)
+    fcc = FatcatCrossrefClient(args.host_url)
+    fcc.import_crossref_file(args.json_file)
+    # create_containers=args.create_containers
 
 def run_import_orcid(args):
     foi = FatcatOrcidImporter(args.host_url)
     foi.process_batch(args.json_file, size=args.batch_size)
 
 def health(args):
-    fcc = FatCatApiClient(args.host_url)
-    print(fcc.health())
+    rfac = RawFatcatApiClient(args.host_url)
+    print(rfac.health())
 
 def main():
     parser = argparse.ArgumentParser()
@@ -32,9 +32,10 @@ def main():
     sub_import_crossref.set_defaults(func=run_import_crossref)
     sub_import_crossref.add_argument('json_file',
         help="crossref JSON file to import from")
-    sub_import_crossref.add_argument('--create-containers',
-        action='store_true',
-        help="if true, create containers based on ISSN")
+    # TODO:
+    #sub_import_crossref.add_argument('--create-containers',
+    #    action='store_true',
+    #    help="if true, create containers based on ISSN")
 
     sub_import_orcid = subparsers.add_parser('import-orcid')
     sub_import_orcid.set_defaults(func=run_import_orcid)
diff --git a/python/fatcat/api_client.py b/python/fatcat/api_client.py
deleted file mode 100644
index 4c000609..00000000
--- a/python/fatcat/api_client.py
+++ /dev/null
@@ -1,177 +0,0 @@
-
-import sys
-import json
-import requests
-
-
-class FatCatApiClient:
-
-    def __init__(self, host_url):
-        self.host_url = host_url
-        self.session = requests.Session()
-        self._issn_map = dict()
-
-    def get(self, path, data=None):
-        headers = {"content-type": "application/json"}
-        return self.session.get(self.host_url + path, json=data,
-            headers=headers)
-
-    def post(self, path, data=None):
-        headers = {"content-type": "application/json"}
-        return self.session.post(self.host_url + path, json=data,
-            headers=headers)
-
-    def new_editgroup(self):
-        rv = self.post('/v0/editgroup', data=dict(
-            editor_id=1))
-        print(rv)
-        print(rv.json())
-        assert rv.status_code == 201
-        editgroup_id = rv.json()['id']
-        return editgroup_id
-
-    def accept_editgroup(self, eg):
-        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
-        assert rv.status_code == 200
-        return rv
-
-    def lookup_issn(self, issn):
-        assert len(issn) == 9 and issn[4] == '-'
-        if issn in self._issn_map:
-            return self._issn_map[issn]
-        rv = self.get('/v0/container/lookup', data=dict(issn=issn))
-        container_id = None
-        if rv.status_code == 200:
-            container_id = rv.json()['id']
-        else:
-            # only other valid response is a 404; otherwise we had an error
-            assert rv.status_code == 404
-        self._issn_map[issn] = container_id
-        return container_id
-
-    def import_crossref_file(self, json_file, create_containers=False, batchsize=100):
-        eg = self.new_editgroup()
-        i = 0
-        with open(json_file, 'r') as file:
-            for line in file:
-                if i % batchsize == 0:
-                    sys.stdout.write('\n{}: '.format(i))
-                if (i+1) % 20 == 0:
-                    sys.stdout.write('.')
-                i = i + 1
-                obj = json.loads(line)
-                if not ("author" in obj and "title" in obj):
-                    continue
-                try:
-                    self.import_crossref_dict(obj, editgroup=eg,
-                        create_containers=create_containers)
-                except Exception as e:
-                    print("ERROR: {}".format(e))
-                if i % batchsize == 0:
-                    self.accept_editgroup(eg)
-                    eg = self.new_editgroup()
-        if i % batchsize != 0:
-            self.accept_editgroup(eg)
-        print("done!")
-
-    def import_crossref_dict(self, meta, editgroup=None,
-            create_containers=False):
-
-        # creators
-        creators = []
-        for am in meta['author']:
-            c = dict(name="{} {}".format(am['given'], am['family']),
-                     sortname="{}, {}".format(am['family'], am['given']),
-                     orcid=None)
-            creators.append(c)
-
-        # container
-        issn = meta.get('ISSN', [None])[0]
-        container_id = self.lookup_issn(issn)
-        container = dict(
-            issn=issn,
-            name=meta['container-title'][0],
-            container=container_id,
-            #sortname=meta['short-container-title'][0])
-            publisher=meta['publisher'])
-
-        if container_id is None and create_containers and issn != None:
-            rv = self.post('/v0/container', data=dict(
-                issn=container['issn'],
-                publisher=container['publisher']))
-            assert rv.status_code == 201
-            container_id = rv.json()['id']
-            print("created container: {}".format(issn))
-            container['id'] = container_id
-            self._issn_map[issn] = container_id
-
-        # references
-        refs = []
-        for i, rm in enumerate(meta.get('reference', [])):
-            ref = dict(
-                doi=rm.get("DOI", None),
-                index=i+1,
-                # TODO: how to generate a proper stub here from k/v metadata?
-                stub="| ".join(rm.values()))
-            refs.append(ref)
-
-        # work and release
-        title = meta['title'][0]
-        rv = self.post('/v0/work',
-            data=dict(title=title, editgroup=editgroup)) #work_type="book"
-        assert rv.status_code == 201
-        work_id = rv.json()['id']
-
-        extra = dict(crossref={
-            'links': meta.get('link', []),
-            'subject': meta.get('subject'),
-            'type': meta['type'],
-            'alternative-id': meta.get('alternative-id', [])})
-
-        rv = self.post('/v0/release', data=dict(
-            title=title,
-            work=work_id,
-            # XXX: creators=creators,
-            # XXX: refs=refs,
-            # XXX: container=container_id,
-            release_type=meta['type'],
-            doi=meta['DOI'],
-            date=meta['created']['date-time'],
-            license=meta.get('license', [dict(URL=None)])[0]['URL'] or None,
-            issue=meta.get('issue', None),
-            volume=meta.get('volume', None),
-            pages=meta.get('page', None),
-            editgroup=editgroup,
-            extra=extra))
-        assert rv.status_code == 201
-        release_id = rv.json()['id']
-
-    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
-        eg = self.new_editgroup()
-        i = 0
-        with open(json_file, 'r') as file:
-            for line in file:
-                if i % batchsize == 0:
-                    sys.stdout.write('\n{}: '.format(i))
-                if (i+1) % 20 == 0:
-                    sys.stdout.write('.')
-                i = i + 1
-                obj = json.loads(line)
-                if not ("author" in obj and "title" in obj):
-                    continue
-                try:
-                    self.import_crossref_dict(obj, editgroup=eg,
-                        create_containers=create_containers)
-                except Exception as e:
-                    print("ERROR: {}".format(e))
-                if i % batchsize == 0:
-                    self.accept_editgroup(eg)
-                    eg = self.new_editgroup()
-        if i % batchsize != 0:
-            self.accept_editgroup(eg)
-        print("done!")
-
-    def health(self):
-        rv = self.get("/health")
-        assert rv.status_code == 200
-        return rv.json()
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
new file mode 100644
index 00000000..4c68230d
--- /dev/null
+++ b/python/fatcat/crossref_importer.py
@@ -0,0 +1,112 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+class FatcatCrossrefImporter(FatcatImporter):
+
+    # TODO: overload __init__ to handle create_containers
+
+    def parse_crossref_dict(self, obj):
+        """
+        obj is a python dict (parsed from json).
+        returns a ReleaseEntity
+        """
+
+        # contribs
+        contribs = []
+        for i, am in enumerate(obj['author']):
+            contribs.append(fatcat_client.ReleaseContrib(
+                creator_id=None, # TODO: orcid lookup
+                index=i,
+                # Sorry humans :(
+                raw="{} {}".format(am['given'], am['family']),
+                role="author"))
+
+        # container
+        # TODO: ISSN vs. ISSN-L
+        issn = obj.get('ISSN', [None])[0]
+        container_id = self.lookup_issnl(issn)
+
+        ## TODO: create containers in-line like this?
+        #container = dict(
+        #    issn=issn,
+        #    name=obj['container-title'][0],
+        #    container=container_id,
+        #    #sortname=obj['short-container-title'][0])
+        #    publisher=obj['publisher'])
+        #if container_id is None and self.create_containers and issn != None:
+        #    rv = self.post('/v0/container', data=dict(
+        #        issn=container['issn'],
+        #        publisher=container['publisher']))
+        #    assert rv.status_code == 201
+        #    container_id = rv.json()['id']
+        #    print("created container: {}".format(issn))
+        #    container['id'] = container_id
+        #    self._issn_map[issn] = container_id
+
+        # references
+        refs = []
+        for i, rm in enumerate(obj.get('reference', [])):
+            refs.append(fatcat_client.ReleaseRef(
+                index=i,
+                target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
+                # TODO: all these
+                key=None,
+                year=None,
+                container_title=None,
+                title=None,
+                locator=None,
+                # TODO: how to generate a proper stub here from k/v objdata?
+                # TODO: just dump JSON here if we didn't get a match?
+                raw="| ".join(rm.values())))
+
+        # work
+        we = fatcat_client.WorkEntity(
+            work_type=obj['type'],
+        )
+
+        # release
+        extra = dict(crossref={
+            'links': obj.get('link', []),
+            'subject': obj.get('subject'),
+            'crossref-type': obj['type'],
+            'alternative-id': obj.get('alternative-id', [])})
+
+        re = fatcat_client.ReleaseEntity(
+            work_id='null', # XXX:
+            title=obj['title'][0],
+            contribs=contribs,
+            refs=refs,
+            container_id=container_id,
+            release_type=obj['type'],
+            doi=obj['DOI'],
+            release_date=obj['created']['date-time'],
+            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
+            issue=obj.get('issue'),
+            volume=obj.get('volume'),
+            pages=obj.get('page'),
+            extra=extra)
+        return (we, re)
+
+    def create_row(self, row, editgroup_id=None):
+        if row is None:
+            return
+        obj = json.loads(row)
+        both = self.parse_crossref_dict(obj)
+        if both is not None:
+            (we, re) = both
+            we.editgroup_id = editgroup_id
+            re.editgroup_id = editgroup_id
+            created = self.api.create_work(we)
+            re.work_id = created.ident
+            self.api.create_release(re)
+
+    def create_batch(self, batch, editgroup_id=None):
+        """Current work/release pairing disallows batch creation of releases.
+        Could do batch work creation and then match against releases, but meh."""
+        for row in batch:
+            self.create_row(row, editgroup_id)
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
new file mode 100644
index 00000000..98bfb26e
--- /dev/null
+++ b/python/fatcat/importer_common.py
@@ -0,0 +1,53 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat_client.rest import ApiException
+
+# from: https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    args = [iter(iterable)] * n
+    return itertools.zip_longest(*args, fillvalue=fillvalue)
+
+class FatcatImporter:
+
+    def __init__(self, host_url):
+        conf = fatcat_client.Configuration()
+        conf.host = host_url
+        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+        self._issnl_map = dict()
+
+    def process_source(self, source, group_size=100):
+        """Creates and auto-accepts editgroup every group_size rows"""
+        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        for i, row in enumerate(source):
+            self.create_row(row, editgroup_id=eg.id)
+            if i > 0 and (i % group_size) == 0:
+                self.api.accept_editgroup(eg.id)
+                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        if i == 0 or (i % group_size) != 0:
+            self.api.accept_editgroup(eg.id)
+
+    def process_batch(self, source, size=50):
+        """Reads and processes in batches (not API-call-per-row)"""
+        for rows in grouper(source, size):
+            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+            self.create_batch(rows, eg.id)
+            self.api.accept_editgroup(eg.id)
+
+    def lookup_issnl(self, issnl):
+        """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
+        assert len(issnl) == 9 and issnl[4] == '-'
+        if issnl in self._issnl_map:
+            return self._issnl_map[issnl]
+        container_id = None
+        try:
+            rv = self.api.lookup_container(issnl=issnl)
+            container_id = rv.ident
+        except ApiException as ae:
+            # If anything other than a 404 (not found), something is wrong
+            assert ae.status == 404
+        self._issnl_map[issnl] = container_id # might be None
+        return container_id
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index ba8d0bd7..fb4716df 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -3,6 +3,8 @@ import sys
 import json
 import itertools
 import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
 
 def value_or_none(e):
     if type(e) == dict:
@@ -11,18 +13,7 @@ def value_or_none(e):
         e = None
     return e
 
-# from: https://docs.python.org/3/library/itertools.html
-def grouper(iterable, n, fillvalue=None):
-    "Collect data into fixed-length chunks or blocks"
-    args = [iter(iterable)] * n
-    return itertools.zip_longest(*args, fillvalue=fillvalue)
-
-class FatcatOrcidImporter:
-
-    def __init__(self, host_url):
-        conf = fatcat_client.Configuration()
-        conf.host = host_url
-        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+class FatcatOrcidImporter(FatcatImporter):
 
     def parse_orcid_dict(self, obj):
         """
@@ -47,34 +38,18 @@ class FatcatOrcidImporter:
             extra=extra)
         return ce
 
-    def process_line(self, line, editgroup_id=None):
-        """Doesn't accept the editgroup"""
-        obj = json.loads(line)
+    def create_row(self, row, editgroup_id=None):
+        obj = json.loads(row)
         ce = self.parse_orcid_dict(obj)
         if ce is not None:
             ce.editgroup_id = editgroup_id
             self.api.create_creator(ce)
 
-    def process_source(self, source, group_size=100):
-        """Creates and auto-accepts editgropu every group_size lines"""
-        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-        for i, line in enumerate(source):
-            self.process_line(line, editgroup_id=eg.id)
-            if i > 0 and (i % group_size) == 0:
-                self.api.accept_editgroup(eg)
-                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-        if i == 0 or (i % group_size) != 0:
-            self.api.accept_editgroup(eg.id)
-
-    def process_batch(self, source, size=50):
+    def create_batch(self, batch, editgroup_id=None):
         """Reads and processes in batches (not API-call-per-line)"""
-        for lines in grouper(source, size):
-            objects = [self.parse_orcid_dict(json.loads(l))
-                       for l in lines if l != None]
-            objects = [o for o in objects if o != None]
-            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-            for o in objects:
-                o.editgroup_id = eg.id
-            self.api.create_creator_batch(objects)
-            self.api.accept_editgroup(eg.id)
-            print("inserted {}".format(len(objects)))
+        objects = [self.parse_orcid_dict(json.loads(l))
+                   for l in batch if l != None]
+        objects = [o for o in objects if o != None]
+        for o in objects:
+            o.editgroup_id = editgroup_id
+        self.api.create_creator_batch(objects)
diff --git a/python/fatcat/raw_api_client.py b/python/fatcat/raw_api_client.py
new file mode 100644
index 00000000..75151ebb
--- /dev/null
+++ b/python/fatcat/raw_api_client.py
@@ -0,0 +1,66 @@
+
+import sys
+import json
+import requests
+
+
+class RawFatcatApiClient:
+
+    def __init__(self, host_url):
+        self.host_url = host_url
+        self.session = requests.Session()
+        self._issn_map = dict()
+
+    def get(self, path, data=None):
+        headers = {"content-type": "application/json"}
+        return self.session.get(self.host_url + path, json=data,
+            headers=headers)
+
+    def post(self, path, data=None):
+        headers = {"content-type": "application/json"}
+        return self.session.post(self.host_url + path, json=data,
+            headers=headers)
+
+    def new_editgroup(self):
+        rv = self.post('/v0/editgroup', data=dict(
+            editor_id=1))
+        print(rv)
+        print(rv.json())
+        assert rv.status_code == 201
+        editgroup_id = rv.json()['id']
+        return editgroup_id
+
+    def accept_editgroup(self, eg):
+        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
+        assert rv.status_code == 200
+        return rv
+
+    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
+        eg = self.new_editgroup()
+        i = 0
+        with open(json_file, 'r') as file:
+            for line in file:
+                if i % batchsize == 0:
+                    sys.stdout.write('\n{}: '.format(i))
+                if (i+1) % 20 == 0:
+                    sys.stdout.write('.')
+                i = i + 1
+                obj = json.loads(line)
+                if not ("author" in obj and "title" in obj):
+                    continue
+                try:
+                    self.import_crossref_dict(obj, editgroup=eg,
+                        create_containers=create_containers)
+                except Exception as e:
+                    print("ERROR: {}".format(e))
+                if i % batchsize == 0:
+                    self.accept_editgroup(eg)
+                    eg = self.new_editgroup()
+        if i % batchsize != 0:
+            self.accept_editgroup(eg)
+        print("done!")
+
+    def health(self):
+        rv = self.get("/health")
+        assert rv.status_code == 200
+        return rv.json()
diff --git a/python/tests/api_client.py b/python/tests/api_client.py
deleted file mode 100644
index 9d2ec302..00000000
--- a/python/tests/api_client.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-import pytest
-import fatcat.api_client
-from fixtures import *
-
-
-# TODO:
-#def test_client_health(api_client):
-#    assert api_client.health() != None
-
-
-def test_import_crossref(api_client):
-    api_client.import_crossref_file('tests/files/crossref-works.2018-01-21.badsample.json')
-
-    # TODO: use API to check that entities actually created...
diff --git a/python/tests/crossref.py b/python/tests/crossref.py
new file mode 100644
index 00000000..e72f7d02
--- /dev/null
+++ b/python/tests/crossref.py
@@ -0,0 +1,16 @@
+
+import pytest
+from fatcat.crossref_importer import FatcatCrossrefImporter
+
+
+@pytest.fixture(scope="function")
+def crossref_importer():
+    yield FatcatCrossrefImporter("http://localhost:9411/v0")
+
+def test_crossref_importer_batch(crossref_importer):
+    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
+        crossref_importer.process_batch(f)
+
+def test_crossref_importer(crossref_importer):
+    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
+        crossref_importer.process_source(f)
diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py
index b1682e79..c9da9253 100644
--- a/python/tests/fixtures.py
+++ b/python/tests/fixtures.py
@@ -19,23 +19,12 @@ def app(full_app):
 
 
 @pytest.fixture(scope="function")
-def api_client(full_app):
-
-    # TODO:
-    #pid = os.fork()
-    #if pid == 0:
-    #    full_app.testing = False
-    #    full_app.run(host="localhost", port=8444, debug=False)
-    #    os._exit(0)
-    #
-    #time.sleep(0.2)
-    #yield fatcat.api_client.FatCatApiClient("http://localhost:8444")
-    #os.kill(pid, signal.SIGKILL)
-
-    yield fatcat.api_client.FatCatApiClient("http://localhost:9411")
+def raw_api_client():
+    yield fatcat.raw_api_client.RawFatcatApiClient("http://localhost:9411")
 
 
 ## Helpers ##################################################################
 
+# TODO: what are these even here for?
 def check_entity_fields(e):
     for key in ('rev', 'is_live', 'redirect_id'):
diff --git a/python/tests/orcid.py b/python/tests/orcid.py
index 86a23603..d0e99cfc 100644
--- a/python/tests/orcid.py
+++ b/python/tests/orcid.py
@@ -2,10 +2,12 @@
 
 import pytest
 from fatcat.orcid_importer import FatcatOrcidImporter
 
+
 @pytest.fixture(scope="function")
 def orcid_importer():
     yield FatcatOrcidImporter("http://localhost:9411/v0")
 
+# TODO: use API to check that entities actually created...
 def test_orcid_importer_batch(orcid_importer):
     with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
         orcid_importer.process_batch(f)
```
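The extension contract that `importer_common.py` establishes: a subclass implements `create_row()` (driven by `process_source()`, one API call per row) and `create_batch()` (driven by `process_batch()`), while the base class owns editgroup creation and acceptance. A sketch of a hypothetical third importer against that contract — the `DummyCreatorImporter` name and the `display_name` field are illustrative assumptions, not part of this commit:

```python
import json
import fatcat_client
from fatcat.importer_common import FatcatImporter

class DummyCreatorImporter(FatcatImporter):
    """Hypothetical importer: one JSON object per row, one creator per object."""

    def create_row(self, row, editgroup_id=None):
        # Called by process_source(); the base class supplies editgroup_id.
        obj = json.loads(row)
        # The entity field name below is an assumption for illustration only.
        ce = fatcat_client.CreatorEntity(display_name=obj['name'])
        ce.editgroup_id = editgroup_id
        self.api.create_creator(ce)

    def create_batch(self, batch, editgroup_id=None):
        # Called by process_batch(); grouper() pads the last chunk with None.
        for row in batch:
            if row is not None:
                self.create_row(row, editgroup_id)
```

With that in place, `DummyCreatorImporter("http://localhost:9411/v0").process_source(open("rows.json"))` would accept one editgroup per 100 rows (the `group_size` default).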
