summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2018-09-14 15:02:52 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2018-09-14 15:06:47 -0700
commit 61caceebcc5cd04b28d9859b27ac314bb2a59bbb (patch)
tree ee70241ade0fb769e33b0312873826d243740282
parent ac0b49ee3e04d98ad5b6dd8c2360a71d7ecce1a3 (diff)
download fatcat-61caceebcc5cd04b28d9859b27ac314bb2a59bbb.tar.gz
fatcat-61caceebcc5cd04b28d9859b27ac314bb2a59bbb.zip
add insert counting to importers
-rw-r--r-- python/fatcat/crossref_importer.py 2
-rw-r--r-- python/fatcat/importer_common.py 9
-rw-r--r-- python/fatcat/issn_importer.py 2
-rw-r--r-- python/fatcat/orcid_importer.py 2
-rwxr-xr-x python/fatcat_import.py 22
5 files changed, 37 insertions, 0 deletions
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index a2e14ed1..2154c8c0 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -160,6 +160,7 @@ class FatcatCrossrefImporter(FatcatImporter):
re.container_id = container.ident
self._issnl_id_map[ce.issnl] = container.ident
self.api.create_release(re, editgroup=editgroup)
+ self.insert_count = self.insert_count + 1
def create_batch(self, batch, editgroup=None):
"""Current work/release pairing disallows batch creation of releases.
@@ -178,3 +179,4 @@ class FatcatCrossrefImporter(FatcatImporter):
self._issnl_id_map[ce.issnl] = container.ident
release_batch.append(re)
self.api.create_release_batch(release_batch, autoaccept="true", editgroup=editgroup)
+ self.insert_count = self.insert_count + len(release_batch)
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
index ff0c8a27..95bec8a1 100644
--- a/python/fatcat/importer_common.py
+++ b/python/fatcat/importer_common.py
@@ -26,6 +26,13 @@ class FatcatImporter:
self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
if issn_map_file:
self.read_issn_map_file(issn_map_file)
+ self.processed_lines = 0
+ self.insert_count = 0
+ self.update_count = 0
+
+ def describe_run(self):
+ print("Processed {} lines, inserted {}, updated {}.".format(
+ self.processed_lines, self.insert_count, self.update_count))
def process_source(self, source, group_size=100):
"""Creates and auto-accepts editgroup every group_size rows"""
@@ -37,12 +44,14 @@ class FatcatImporter:
self.api.accept_editgroup(eg)
eg = self.api.create_editgroup(
fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+ self.processed_lines = self.processed_lines + 1
if i == 0 or (i % group_size) != 0:
self.api.accept_editgroup(eg.id)
def process_batch(self, source, size=50):
"""Reads and processes in batches (not API-call-per-)"""
for rows in grouper(source, size):
+ self.processed_lines = self.processed_lines + len(rows)
eg = self.api.create_editgroup(
fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
self.create_batch(rows, editgroup=eg.id)
diff --git a/python/fatcat/issn_importer.py b/python/fatcat/issn_importer.py
index ad2cad78..c9ef50b5 100644
--- a/python/fatcat/issn_importer.py
+++ b/python/fatcat/issn_importer.py
@@ -61,6 +61,7 @@ class FatcatIssnImporter(FatcatImporter):
ce = self.parse_issn_row(row)
if ce is not None:
self.api.create_container(ce, editgroup=editgroup)
+ self.insert_count = self.insert_count + 1
def create_batch(self, batch, editgroup=None):
"""Reads and processes in batches (not API-call-per-line)"""
@@ -68,3 +69,4 @@ class FatcatIssnImporter(FatcatImporter):
for l in batch if l != None]
objects = [o for o in objects if o != None]
self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup)
+ self.insert_count = self.insert_count + len(objects)
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index 2eeac122..e1f5943c 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -62,6 +62,7 @@ class FatcatOrcidImporter(FatcatImporter):
ce = self.parse_orcid_dict(obj)
if ce is not None:
self.api.create_creator(ce, editgroup=editgroup)
+ self.insert_count = self.insert_count + 1
def create_batch(self, batch, editgroup=None):
"""Reads and processes in batches (not API-call-per-line)"""
@@ -69,3 +70,4 @@ class FatcatOrcidImporter(FatcatImporter):
for l in batch if l != None]
objects = [o for o in objects if o != None]
self.api.create_creator_batch(objects, autoaccept="true", editgroup=editgroup)
+ self.insert_count = self.insert_count + len(objects)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 2f0c746f..525cf286 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -12,18 +12,28 @@ def run_import_crossref(args):
fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file,
args.extid_map_file, create_containers=(not args.no_create_containers))
fci.process_batch(args.json_file, size=args.batch_size)
+ fci.describe_run()
def run_import_orcid(args):
foi = FatcatOrcidImporter(args.host_url)
foi.process_batch(args.json_file, size=args.batch_size)
+ foi.describe_run()
def run_import_issn(args):
fii = FatcatIssnImporter(args.host_url)
fii.process_csv_batch(args.csv_file, size=args.batch_size)
+ fii.describe_run()
def run_import_manifest(args):
fmi = FatcatManifestImporter(args.host_url)
fmi.process_db(args.db_path, size=args.batch_size)
+ fmi.describe_run()
+
+def run_import_matched(args):
+ fmi = FatcatMatchedImporter(args.host_url,
+ skip_file_update=args.no_file_update)
+ fmi.process_db(args.db_path, size=args.batch_size)
+ fmi.describe_run()
def health(args):
rfac = RawFatcatApiClient(args.host_url)
@@ -84,6 +94,18 @@ def main():
help="size of batch to send",
default=50, type=int)
+ sub_import_matched = subparsers.add_parser('import-matched')
+ sub_import_matched.set_defaults(func=run_import_matched)
+ sub_import_matched.add_argument('json_file',
+ help="JSON file to import from (or stdin)",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_import_matched.add_argument('--no-file-update',
+ action='store_true',
+ help="don't lookup existing files, just insert (only for bootstrap)")
+ sub_import_matched.add_argument('--batch-size',
+ help="size of batch to send",
+ default=50, type=int)
+
sub_health = subparsers.add_parser('health')
sub_health.set_defaults(func=health)