| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 09:37:37 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 09:37:37 -0700 |
| commit | bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804 (patch) | |
| tree | 67d7039b1621bebdafd89539602c2b5d05332501 /python | |
| parent | 698399c49edcefe33c012856b604985925969a77 (diff) | |
| download | fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.tar.gz, fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.zip | |
python: refactor importer code (+crossref)
Diffstat (limited to 'python')
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | python/client.py | 19 |
| -rw-r--r-- | python/fatcat/api_client.py | 177 |
| -rw-r--r-- | python/fatcat/crossref_importer.py | 112 |
| -rw-r--r-- | python/fatcat/importer_common.py | 53 |
| -rw-r--r-- | python/fatcat/orcid_importer.py | 49 |
| -rw-r--r-- | python/fatcat/raw_api_client.py | 66 |
| -rw-r--r-- | python/tests/api_client.py | 15 |
| -rw-r--r-- | python/tests/crossref.py | 16 |
| -rw-r--r-- | python/tests/fixtures.py | 17 |
| -rw-r--r-- | python/tests/orcid.py | 2 |
10 files changed, 274 insertions, 252 deletions
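For orientation: the refactor replaces the monolithic `FatCatApiClient` with a shared `FatcatImporter` base class (editgroup lifecycle, cached ISSN-L lookups) in `importer_common.py`, per-source subclasses (`FatcatCrossrefImporter`, `FatcatOrcidImporter`), and a thin `RawFatcatApiClient` for the remaining hand-rolled HTTP calls. A minimal sketch of driving one of the refactored importers, mirroring `client.py` and the tests in this commit (it assumes a fatcat API server listening on localhost:9411):

```python
from fatcat.orcid_importer import FatcatOrcidImporter

# One JSON record per line; process_batch() groups rows into chunks of
# `size`, creates one editgroup per chunk, and accepts it after insertion.
importer = FatcatOrcidImporter("http://localhost:9411/v0")
with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
    importer.process_batch(f, size=50)
```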
```diff
diff --git a/python/client.py b/python/client.py
index 14814512..9631318a 100755
--- a/python/client.py
+++ b/python/client.py
@@ -2,21 +2,21 @@
 
 import sys
 import argparse
-from fatcat.api_client import FatCatApiClient
+from fatcat.raw_api_client import RawFatcatApiClient
 from fatcat.orcid_importer import FatcatOrcidImporter
 
 def run_import_crossref(args):
-    fcc = FatCatApiClient(args.host_url)
-    fcc.import_crossref_file(args.json_file,
-        create_containers=args.create_containers)
+    fcc = FatcatCrossrefClient(args.host_url)
+    fcc.import_crossref_file(args.json_file)
+    # create_containers=args.create_containers
 
 def run_import_orcid(args):
     foi = FatcatOrcidImporter(args.host_url)
     foi.process_batch(args.json_file, size=args.batch_size)
 
 def health(args):
-    fcc = FatCatApiClient(args.host_url)
-    print(fcc.health())
+    rfac = RawFatcatApiClient(args.host_url)
+    print(rfac.health())
 
 def main():
     parser = argparse.ArgumentParser()
@@ -32,9 +32,10 @@ def main():
     sub_import_crossref.set_defaults(func=run_import_crossref)
     sub_import_crossref.add_argument('json_file',
         help="crossref JSON file to import from")
-    sub_import_crossref.add_argument('--create-containers',
-        action='store_true',
-        help="if true, create containers based on ISSN")
+    # TODO:
+    #sub_import_crossref.add_argument('--create-containers',
+    #    action='store_true',
+    #    help="if true, create containers based on ISSN")
 
     sub_import_orcid = subparsers.add_parser('import-orcid')
     sub_import_orcid.set_defaults(func=run_import_orcid)
diff --git a/python/fatcat/api_client.py b/python/fatcat/api_client.py
deleted file mode 100644
index 4c000609..00000000
--- a/python/fatcat/api_client.py
+++ /dev/null
@@ -1,177 +0,0 @@
-
-import sys
-import json
-import requests
-
-
-class FatCatApiClient:
-
-    def __init__(self, host_url):
-        self.host_url = host_url
-        self.session = requests.Session()
-        self._issn_map = dict()
-
-    def get(self, path, data=None):
-        headers = {"content-type": "application/json"}
-        return self.session.get(self.host_url + path, json=data,
-            headers=headers)
-
-    def post(self, path, data=None):
-        headers = {"content-type": "application/json"}
-        return self.session.post(self.host_url + path, json=data,
-            headers=headers)
-
-    def new_editgroup(self):
-        rv = self.post('/v0/editgroup', data=dict(
-            editor_id=1))
-        print(rv)
-        print(rv.json())
-        assert rv.status_code == 201
-        editgroup_id = rv.json()['id']
-        return editgroup_id
-
-    def accept_editgroup(self, eg):
-        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
-        assert rv.status_code == 200
-        return rv
-
-    def lookup_issn(self, issn):
-        assert len(issn) == 9 and issn[4] == '-'
-        if issn in self._issn_map:
-            return self._issn_map[issn]
-        rv = self.get('/v0/container/lookup', data=dict(issn=issn))
-        container_id = None
-        if rv.status_code == 200:
-            container_id = rv.json()['id']
-        else:
-            # only other valid response is a 404; otherwise we had an error
-            assert rv.status_code == 404
-        self._issn_map[issn] = container_id
-        return container_id
-
-    def import_crossref_file(self, json_file, create_containers=False, batchsize=100):
-        eg = self.new_editgroup()
-        i = 0
-        with open(json_file, 'r') as file:
-            for line in file:
-                if i % batchsize == 0:
-                    sys.stdout.write('\n{}: '.format(i))
-                if (i+1) % 20 == 0:
-                    sys.stdout.write('.')
-                i = i + 1
-                obj = json.loads(line)
-                if not ("author" in obj and "title" in obj):
-                    continue
-                try:
-                    self.import_crossref_dict(obj, editgroup=eg,
-                        create_containers=create_containers)
-                except Exception as e:
-                    print("ERROR: {}".format(e))
-                if i % batchsize == 0:
-                    self.accept_editgroup(eg)
-                    eg = self.new_editgroup()
-        if i % batchsize != 0:
-            self.accept_editgroup(eg)
-        print("done!")
-
-    def import_crossref_dict(self, meta, editgroup=None,
-            create_containers=False):
-
-        # creators
-        creators = []
-        for am in meta['author']:
-            c = dict(name="{} {}".format(am['given'], am['family']),
-                     sortname="{}, {}".format(am['family'], am['given']),
-                     orcid=None)
-            creators.append(c)
-
-        # container
-        issn = meta.get('ISSN', [None])[0]
-        container_id = self.lookup_issn(issn)
-        container = dict(
-            issn=issn,
-            name=meta['container-title'][0],
-            container=container_id,
-            #sortname=meta['short-container-title'][0])
-            publisher=meta['publisher'])
-
-        if container_id is None and create_containers and issn != None:
-            rv = self.post('/v0/container', data=dict(
-                issn=container['issn'],
-                publisher=container['publisher']))
-            assert rv.status_code == 201
-            container_id = rv.json()['id']
-            print("created container: {}".format(issn))
-            container['id'] = container_id
-            self._issn_map[issn] = container_id
-
-        # references
-        refs = []
-        for i, rm in enumerate(meta.get('reference', [])):
-            ref = dict(
-                doi=rm.get("DOI", None),
-                index=i+1,
-                # TODO: how to generate a proper stub here from k/v metadata?
-                stub="| ".join(rm.values()))
-            refs.append(ref)
-
-        # work and release
-        title = meta['title'][0]
-        rv = self.post('/v0/work',
-            data=dict(title=title, editgroup=editgroup)) #work_type="book"
-        assert rv.status_code == 201
-        work_id = rv.json()['id']
-
-        extra = dict(crossref={
-            'links': meta.get('link', []),
-            'subject': meta.get('subject'),
-            'type': meta['type'],
-            'alternative-id': meta.get('alternative-id', [])})
-
-        rv = self.post('/v0/release', data=dict(
-            title=title,
-            work=work_id,
-            # XXX: creators=creators,
-            # XXX: refs=refs,
-            # XXX: container=container_id,
-            release_type=meta['type'],
-            doi=meta['DOI'],
-            date=meta['created']['date-time'],
-            license=meta.get('license', [dict(URL=None)])[0]['URL'] or None,
-            issue=meta.get('issue', None),
-            volume=meta.get('volume', None),
-            pages=meta.get('page', None),
-            editgroup=editgroup,
-            extra=extra))
-        assert rv.status_code == 201
-        release_id = rv.json()['id']
-
-    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
-        eg = self.new_editgroup()
-        i = 0
-        with open(json_file, 'r') as file:
-            for line in file:
-                if i % batchsize == 0:
-                    sys.stdout.write('\n{}: '.format(i))
-                if (i+1) % 20 == 0:
-                    sys.stdout.write('.')
-                i = i + 1
-                obj = json.loads(line)
-                if not ("author" in obj and "title" in obj):
-                    continue
-                try:
-                    self.import_crossref_dict(obj, editgroup=eg,
-                        create_containers=create_containers)
-                except Exception as e:
-                    print("ERROR: {}".format(e))
-                if i % batchsize == 0:
-                    self.accept_editgroup(eg)
-                    eg = self.new_editgroup()
-        if i % batchsize != 0:
-            self.accept_editgroup(eg)
-        print("done!")
-
-    def health(self):
-        rv = self.get("/health")
-        assert rv.status_code == 200
-        return rv.json()
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
new file mode 100644
index 00000000..4c68230d
--- /dev/null
+++ b/python/fatcat/crossref_importer.py
@@ -0,0 +1,112 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+class FatcatCrossrefImporter(FatcatImporter):
+
+    # TODO: overload __init__ to handle create_containers
+
+    def parse_crossref_dict(self, obj):
+        """
+        obj is a python dict (parsed from json).
+        returns a ReleaseEntity
+        """
+
+        # contribs
+        contribs = []
+        for i, am in enumerate(obj['author']):
+            contribs.append(fatcat_client.ReleaseContrib(
+                creator_id=None, # TODO: orcid lookup
+                index=i,
+                # Sorry humans :(
+                raw="{} {}".format(am['given'], am['family']),
+                role="author"))
+
+        # container
+        # TODO: ISSN vs. ISSN-L
+        issn = obj.get('ISSN', [None])[0]
+        container_id = self.lookup_issnl(issn)
+
+        ## TODO: create containers in-line like this?
+        #container = dict(
+        #    issn=issn,
+        #    name=obj['container-title'][0],
+        #    container=container_id,
+        #    #sortname=obj['short-container-title'][0])
+        #    publisher=obj['publisher'])
+        #if container_id is None and self.create_containers and issn != None:
+        #    rv = self.post('/v0/container', data=dict(
+        #        issn=container['issn'],
+        #        publisher=container['publisher']))
+        #    assert rv.status_code == 201
+        #    container_id = rv.json()['id']
+        #    print("created container: {}".format(issn))
+        #    container['id'] = container_id
+        #    self._issn_map[issn] = container_id
+
+        # references
+        refs = []
+        for i, rm in enumerate(obj.get('reference', [])):
+            refs.append(fatcat_client.ReleaseRef(
+                index=i,
+                target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
+                # TODO: all these
+                key=None,
+                year=None,
+                container_title=None,
+                title=None,
+                locator=None,
+                # TODO: how to generate a proper stub here from k/v objdata?
+                # TODO: just dump JSON here if we didn't get a match?
+                raw="| ".join(rm.values())))
+
+        # work
+        we = fatcat_client.WorkEntity(
+            work_type=obj['type'],
+        )
+
+        # release
+        extra = dict(crossref={
+            'links': obj.get('link', []),
+            'subject': obj.get('subject'),
+            'crossref-type': obj['type'],
+            'alternative-id': obj.get('alternative-id', [])})
+
+        re = fatcat_client.ReleaseEntity(
+            work_id='null', # XXX:
+            title=obj['title'][0],
+            contribs=contribs,
+            refs=refs,
+            container_id=container_id,
+            release_type=obj['type'],
+            doi=obj['DOI'],
+            release_date=obj['created']['date-time'],
+            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
+            issue=obj.get('issue'),
+            volume=obj.get('volume'),
+            pages=obj.get('page'),
+            extra=extra)
+        return (we, re)
+
+    def create_row(self, row, editgroup_id=None):
+        if row is None:
+            return
+        obj = json.loads(row)
+        both = self.parse_crossref_dict(obj)
+        if both is not None:
+            (we, re) = both
+            we.editgroup_id = editgroup_id
+            re.editgroup_id = editgroup_id
+            created = self.api.create_work(we)
+            re.work_id = created.ident
+            self.api.create_release(re)
+
+    def create_batch(self, batch, editgroup_id=None):
+        """Current work/release pairing disallows batch creation of releases.
+        Could do batch work creation and then match against releases, but meh."""
+        for row in batch:
+            self.create_row(row, editgroup_id)
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
new file mode 100644
index 00000000..98bfb26e
--- /dev/null
+++ b/python/fatcat/importer_common.py
@@ -0,0 +1,53 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat_client.rest import ApiException
+
+# from: https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    args = [iter(iterable)] * n
+    return itertools.zip_longest(*args, fillvalue=fillvalue)
+
+class FatcatImporter:
+
+    def __init__(self, host_url):
+        conf = fatcat_client.Configuration()
+        conf.host = host_url
+        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+        self._issnl_map = dict()
+
+    def process_source(self, source, group_size=100):
+        """Creates and auto-accepts editgroup every group_size rows"""
+        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        for i, row in enumerate(source):
+            self.create_row(row, editgroup_id=eg.id)
+            if i > 0 and (i % group_size) == 0:
+                self.api.accept_editgroup(eg.id)
+                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        if i == 0 or (i % group_size) != 0:
+            self.api.accept_editgroup(eg.id)
+
+    def process_batch(self, source, size=50):
+        """Reads and processes in batches (not API-call-per-row)"""
+        for rows in grouper(source, size):
+            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+            self.create_batch(rows, eg.id)
+            self.api.accept_editgroup(eg.id)
+
+    def lookup_issnl(self, issnl):
+        """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
+        assert len(issnl) == 9 and issnl[4] == '-'
+        if issnl in self._issnl_map:
+            return self._issnl_map[issnl]
+        container_id = None
+        try:
+            rv = self.api.lookup_container(issnl=issnl)
+            container_id = rv.ident
+        except ApiException as ae:
+            # If anything other than a 404 (not found), something is wrong
+            assert ae.status == 404
+        self._issnl_map[issnl] = container_id # might be None
+        return container_id
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index ba8d0bd7..fb4716df 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -3,6 +3,8 @@ import sys
 import json
 import itertools
 import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
 
 def value_or_none(e):
     if type(e) == dict:
@@ -11,18 +13,7 @@ def value_or_none(e):
         e = None
     return e
 
-# from: https://docs.python.org/3/library/itertools.html
-def grouper(iterable, n, fillvalue=None):
-    "Collect data into fixed-length chunks or blocks"
-    args = [iter(iterable)] * n
-    return itertools.zip_longest(*args, fillvalue=fillvalue)
-
-class FatcatOrcidImporter:
-
-    def __init__(self, host_url):
-        conf = fatcat_client.Configuration()
-        conf.host = host_url
-        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+class FatcatOrcidImporter(FatcatImporter):
 
     def parse_orcid_dict(self, obj):
         """
@@ -47,34 +38,18 @@ class FatcatOrcidImporter:
             extra=extra)
         return ce
 
-    def process_line(self, line, editgroup_id=None):
-        """Doesn't accept the editgroup"""
-        obj = json.loads(line)
+    def create_row(self, row, editgroup_id=None):
+        obj = json.loads(row)
         ce = self.parse_orcid_dict(obj)
         if ce is not None:
             ce.editgroup_id = editgroup_id
             self.api.create_creator(ce)
 
-    def process_source(self, source, group_size=100):
-        """Creates and auto-accepts editgropu every group_size lines"""
-        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-        for i, line in enumerate(source):
-            self.process_line(line, editgroup_id=eg.id)
-            if i > 0 and (i % group_size) == 0:
-                self.api.accept_editgroup(eg)
-                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-        if i == 0 or (i % group_size) != 0:
-            self.api.accept_editgroup(eg.id)
-
-    def process_batch(self, source, size=50):
+    def create_batch(self, batch, editgroup_id=None):
         """Reads and processes in batches (not API-call-per-line)"""
-        for lines in grouper(source, size):
-            objects = [self.parse_orcid_dict(json.loads(l))
-                       for l in lines if l != None]
-            objects = [o for o in objects if o != None]
-            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-            for o in objects:
-                o.editgroup_id = eg.id
-            self.api.create_creator_batch(objects)
-            self.api.accept_editgroup(eg.id)
-            print("inserted {}".format(len(objects)))
+        objects = [self.parse_orcid_dict(json.loads(l))
+                   for l in batch if l != None]
+        objects = [o for o in objects if o != None]
+        for o in objects:
+            o.editgroup_id = editgroup_id
+        self.api.create_creator_batch(objects)
diff --git a/python/fatcat/raw_api_client.py b/python/fatcat/raw_api_client.py
new file mode 100644
index 00000000..75151ebb
--- /dev/null
+++ b/python/fatcat/raw_api_client.py
@@ -0,0 +1,66 @@
+
+import sys
+import json
+import requests
+
+
+class RawFatcatApiClient:
+
+    def __init__(self, host_url):
+        self.host_url = host_url
+        self.session = requests.Session()
+        self._issn_map = dict()
+
+    def get(self, path, data=None):
+        headers = {"content-type": "application/json"}
+        return self.session.get(self.host_url + path, json=data,
+            headers=headers)
+
+    def post(self, path, data=None):
+        headers = {"content-type": "application/json"}
+        return self.session.post(self.host_url + path, json=data,
+            headers=headers)
+
+    def new_editgroup(self):
+        rv = self.post('/v0/editgroup', data=dict(
+            editor_id=1))
+        print(rv)
+        print(rv.json())
+        assert rv.status_code == 201
+        editgroup_id = rv.json()['id']
+        return editgroup_id
+
+    def accept_editgroup(self, eg):
+        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
+        assert rv.status_code == 200
+        return rv
+
+    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
+        eg = self.new_editgroup()
+        i = 0
+        with open(json_file, 'r') as file:
+            for line in file:
+                if i % batchsize == 0:
+                    sys.stdout.write('\n{}: '.format(i))
+                if (i+1) % 20 == 0:
+                    sys.stdout.write('.')
+                i = i + 1
+                obj = json.loads(line)
+                if not ("author" in obj and "title" in obj):
+                    continue
+                try:
+                    self.import_crossref_dict(obj, editgroup=eg,
+                        create_containers=create_containers)
+                except Exception as e:
+                    print("ERROR: {}".format(e))
+                if i % batchsize == 0:
+                    self.accept_editgroup(eg)
+                    eg = self.new_editgroup()
+        if i % batchsize != 0:
+            self.accept_editgroup(eg)
+        print("done!")
+
+    def health(self):
+        rv = self.get("/health")
+        assert rv.status_code == 200
+        return rv.json()
diff --git a/python/tests/api_client.py b/python/tests/api_client.py
deleted file mode 100644
index 9d2ec302..00000000
--- a/python/tests/api_client.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-import pytest
-import fatcat.api_client
-from fixtures import *
-
-
-# TODO:
-#def test_client_health(api_client):
-#    assert api_client.health() != None
-
-
-def test_import_crossref(api_client):
-    api_client.import_crossref_file('tests/files/crossref-works.2018-01-21.badsample.json')
-
-    # TODO: use API to check that entities actually created...
diff --git a/python/tests/crossref.py b/python/tests/crossref.py
new file mode 100644
index 00000000..e72f7d02
--- /dev/null
+++ b/python/tests/crossref.py
@@ -0,0 +1,16 @@
+
+import pytest
+from fatcat.crossref_importer import FatcatCrossrefImporter
+
+
+@pytest.fixture(scope="function")
+def crossref_importer():
+    yield FatcatCrossrefImporter("http://localhost:9411/v0")
+
+def test_crossref_importer_batch(crossref_importer):
+    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
+        crossref_importer.process_batch(f)
+
+def test_crossref_importer(crossref_importer):
+    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
+        crossref_importer.process_source(f)
diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py
index b1682e79..c9da9253 100644
--- a/python/tests/fixtures.py
+++ b/python/tests/fixtures.py
@@ -19,23 +19,12 @@ def app(full_app):
 
 
 @pytest.fixture(scope="function")
-def api_client(full_app):
-
-    # TODO:
-    #pid = os.fork()
-    #if pid == 0:
-    #    full_app.testing = False
-    #    full_app.run(host="localhost", port=8444, debug=False)
-    #    os._exit(0)
-    #
-    #time.sleep(0.2)
-    #yield fatcat.api_client.FatCatApiClient("http://localhost:8444")
-    #os.kill(pid, signal.SIGKILL)
-
-    yield fatcat.api_client.FatCatApiClient("http://localhost:9411")
+def raw_api_client():
+    yield fatcat.raw_api_client.RawFatcatApiClient("http://localhost:9411")
 
 
 ## Helpers ##################################################################
 
+# TODO: what are these even here for?
 def check_entity_fields(e):
     for key in ('rev', 'is_live', 'redirect_id'):
diff --git a/python/tests/orcid.py b/python/tests/orcid.py
index 86a23603..d0e99cfc 100644
--- a/python/tests/orcid.py
+++ b/python/tests/orcid.py
@@ -2,10 +2,12 @@
 
 import pytest
 from fatcat.orcid_importer import FatcatOrcidImporter
 
+
 @pytest.fixture(scope="function")
 def orcid_importer():
     yield FatcatOrcidImporter("http://localhost:9411/v0")
 
+# TODO: use API to check that entities actually created...
 def test_orcid_importer_batch(orcid_importer):
     with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
         orcid_importer.process_batch(f)
```
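The extension contract that `importer_common.py` establishes: a subclass implements `create_row()` (driven by `process_source()`, one API call per row) and `create_batch()` (driven by `process_batch()`), while the base class owns editgroup creation and acceptance. A sketch of a hypothetical third importer against that contract — the `DummyCreatorImporter` name and the `display_name` field are illustrative assumptions, not part of this commit:

```python
import json
import fatcat_client
from fatcat.importer_common import FatcatImporter

class DummyCreatorImporter(FatcatImporter):
    """Hypothetical importer: one JSON object per row, one creator per object."""

    def create_row(self, row, editgroup_id=None):
        # Called by process_source(); the base class supplies editgroup_id.
        obj = json.loads(row)
        # The entity field name below is an assumption for illustration only.
        ce = fatcat_client.CreatorEntity(display_name=obj['name'])
        ce.editgroup_id = editgroup_id
        self.api.create_creator(ce)

    def create_batch(self, batch, editgroup_id=None):
        # Called by process_batch(); grouper() pads the last chunk with None.
        for row in batch:
            if row is not None:
                self.create_row(row, editgroup_id)
```

With that in place, `DummyCreatorImporter("http://localhost:9411/v0").process_source(open("rows.json"))` would accept one editgroup per 100 rows (the `group_size` default).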
