aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat/importer_common.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat/importer_common.py')
-rw-r--r--python/fatcat/importer_common.py53
1 files changed, 53 insertions, 0 deletions
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
new file mode 100644
index 00000000..98bfb26e
--- /dev/null
+++ b/python/fatcat/importer_common.py
@@ -0,0 +1,53 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat_client.rest import ApiException
+
+# from: https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    Returns an iterator of ``n``-tuples drawn in order from ``iterable``.
    When the input runs out partway through a tuple, the remaining slots
    of that final tuple are filled with ``fillvalue``.
    """
    # One shared iterator repeated n times: zip_longest pulls from the
    # same underlying stream for every slot, which yields consecutive chunks.
    shared = iter(iterable)
    slots = (shared,) * n
    return itertools.zip_longest(*slots, fillvalue=fillvalue)
+
class FatcatImporter:
    """Shared plumbing for importers that write through the fatcat API.

    This class does not define ``create_row`` or ``create_batch``; the
    processing methods below call them on ``self``, so a subclass (or the
    instance) must provide whichever one the chosen method uses.
    """

    def __init__(self, host_url):
        """Build an API client pointed at ``host_url`` and an empty ISSN-L cache."""
        conf = fatcat_client.Configuration()
        conf.host = host_url
        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
        # Maps ISSN-L string -> container ident (or None when the lookup 404'd).
        self._issnl_map = dict()

    def process_source(self, source, group_size=100):
        """Create one entity per row, auto-accepting an editgroup every group_size rows.

        Each row is passed to ``self.create_row(row, editgroup_id=...)``.
        Editgroups are created lazily, so an empty ``source`` makes no API
        calls, and no empty editgroup is left un-accepted at the end.

        Raises:
            ValueError: if ``group_size`` is less than 1.
        """
        if group_size < 1:
            raise ValueError("group_size must be at least 1")
        eg = None
        pending = 0  # rows written into the currently open editgroup
        for row in source:
            if eg is None:
                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
            self.create_row(row, editgroup_id=eg.id)
            pending += 1
            if pending == group_size:
                self.api.accept_editgroup(eg.id)
                eg = None
                pending = 0
        # Flush the final, partially filled editgroup (if any).
        if eg is not None:
            self.api.accept_editgroup(eg.id)

    def process_batch(self, source, size=50):
        """Read rows in batches and submit each batch under its own editgroup.

        Each batch is a tuple of up to ``size`` rows passed to
        ``self.create_batch(rows, editgroup_id)``; the final batch may be
        shorter and is never padded with fill values.

        Raises:
            ValueError: if ``size`` is less than 1.
        """
        if size < 1:
            raise ValueError("size must be at least 1")
        row_iter = iter(source)
        while True:
            rows = tuple(itertools.islice(row_iter, size))
            if not rows:
                break
            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
            self.create_batch(rows, eg.id)
            self.api.accept_editgroup(eg.id)

    def lookup_issnl(self, issnl):
        """Return the container ident for an ISSN-L, caching results in a local dict.

        Returns ``None`` (and caches that result) when the API answers 404.
        Any other ``ApiException`` is re-raised and nothing is cached.
        """
        assert len(issnl) == 9 and issnl[4] == '-'
        if issnl in self._issnl_map:
            return self._issnl_map[issnl]
        container_id = None
        try:
            rv = self.api.lookup_container(issnl=issnl)
            container_id = rv.ident
        except ApiException as ae:
            # If anything other than a 404 (not found), something is wrong;
            # re-raise rather than assert so the check survives `python -O`.
            if ae.status != 404:
                raise
        self._issnl_map[issnl] = container_id  # might be None
        return container_id